From 4b6cd151edfc3f73397caabbb2ec04a70af9c97c Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Mon, 29 Sep 2025 16:19:46 +0200
Subject: [PATCH 01/58] Add simulated annealing runner and update CMake
 configuration

---
 tests/smoke/CMakeLists.txt             |   4 +
 tests/smoke/simulated_annealing_re.cpp | 618 +++++++++++++++++++++++++
 2 files changed, 622 insertions(+)
 create mode 100644 tests/smoke/simulated_annealing_re.cpp
diff --git a/tests/smoke/CMakeLists.txt b/tests/smoke/CMakeLists.txt
index 7e7f2af4a..698413936 100644
--- a/tests/smoke/CMakeLists.txt
+++ b/tests/smoke/CMakeLists.txt
@@ -143,6 +143,10 @@ add_grb_executables( conjugate_gradient_complex conjugate_gradient.cpp
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
 	COMPILE_DEFINITIONS _CG_COMPLEX
 )
+add_grb_executables( simulated_annealing_re simulated_annealing_re.cpp
+	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
+	ADDITIONAL_LINK_LIBRARIES test_utils_headers
+)
 
 add_grb_executables( gmres gmres.cpp
 	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
new file mode 100644
index 000000000..73e9b1bbd
--- /dev/null
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -0,0 +1,618 @@
+/*
+  Minimal scaffold adapted from ising_machine_sb.cpp to drive a replica-exchange
+  simulated-annealing (RE-SA) solver.  Algorithmic parts are intentionally left
+  unimplemented (stubs).  This file mirrors the existing IO / launcher /
+  program structure and replaces numpy arrays with grb::Vector and lists of
+  numpy vectors with std::vector< grb::Vector<...> >. Sparse matrices are
+  represented as grb::Matrix< JType >.
+
+  Purpose: allow running internal tests or an external-run mode while the RE-SA
+  algorithm is implemented separately.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <tuple>
+#include <string>
+#include <memory>
+#include <algorithm>
+#include <cassert>
+
+#include <graphblas/nonzeroStorage.hpp>
+#include <graphblas/utils/timer.hpp>
+#include <graphblas/utils/parser.hpp>
+#include <graphblas/utils/singleton.hpp>
+#include <graphblas/utils/iterators/nonzeroIterator.hpp>
+#include <utils/output_verification.hpp>
+#include <graphblas.hpp>
+#include <utils/print_vec_mat.hpp>
+#include <random>
+
+using namespace grb;
+
+#define DEBUG_IMSB 1
+
+// Types
+using IOType = double;   // scalar/vector element type
+using JType  = double;   // coupling (matrix) value type
+
+/** Parser type */
+typedef grb::utils::MatrixFileReader<
+	JType,
+	std::conditional<
+		(sizeof(grb::config::RowIndexType) > sizeof(grb::config::ColIndexType)),
+		grb::config::RowIndexType,
+		grb::config::ColIndexType
+	>::type
+> Parser;
+
+/** Nonzero type */
+typedef internal::NonzeroStorage<
+	grb::config::RowIndexType,
+	grb::config::ColIndexType,
+	JType
+> NonzeroT;
+
+/** In-memory storage type using tuple */
+typedef grb::utils::Singleton<
+    std::tuple<
+        size_t,                    // n (rows/columns)
+        size_t,                    // nz (nonzeros)
+        size_t,                    // nsweeps
+        size_t,                    // n_replicas
+        unsigned,                  // seed
+        std::string,               // sweep_name
+        std::vector<NonzeroT>,     // matrix data
+        std::vector<JType>         // h vector
+    >
+> Storage;
+
+namespace test_data {
+    constexpr size_t n = 16;
+    constexpr size_t n_replicas = 3;
+    constexpr size_t nsweeps = 2;
+    constexpr unsigned seed = 8;
+
+    const std::vector< std::pair< std::pair< grb::config::RowIndexType, grb::config::ColIndexType >, JType > > j_matrix_data = {
+        {{0, 1}, -0.27523006},
+        {{1, 0}, -0.27523006},
+        {{1, 2},  0.28977992},
+        {{2, 1},  0.28977992},
+        {{2, 3}, -0.15453839},
+        {{3, 2}, -0.15453839},
+        {{3, 4},  0.48474944},
+        {{3, 5}, -0.61958321},
+        {{4, 3},  0.48474944},
+        {{4, 5}, -0.11904111},
+        {{5, 3}, -0.61958321},
+        {{5, 4}, -0.11904111},
+        {{5, 6},  0.70296404},
+        {{6, 5},  0.70296404},
+        {{7, 8}, -0.18031020},
+        {{8, 7}, -0.18031020},
+        {{9, 10}, 0.13823881},
+        {{10, 9}, 0.13823881}
+    };
+
+    const size_t nnz = j_matrix_data.size();
+
+    const std::vector< JType > h_array_data = {
+        0.03076145, -0.06152290, 0.09228435, -0.12304580,
+        0.15380725, -0.18456870, 0.21533015, -0.24609160,
+        0.27685305, -0.30761450, 0.33837595, -0.36913740,
+        0.39989885, -0.43066030, 0.46142175, -0.49218320
+    };
+}
+// Runner configuration; filled in by parse_arguments and read by ioProgram and grbProgram.
+struct input {
+    bool use_default_data = false;          // use the embedded test_data instead of files
+    std::string filename_Jmatrix;           // path of the coupling matrix file
+    std::string filename_h;                 // path of the local-field vector file
+    size_t n_replicas = 3;                  // number of replicas
+    size_t nsweeps = 2;                     // number of sweeps
+    unsigned seed = 8;                      // RNG seed
+    std::string sweep_name = "sequential_sweep_immediate"; // sweep selector
+    bool verify = false;                    // a reference solution file is required when set
+    std::string filename_ref_solution;      // path of the reference solution file
+    bool direct = true;                     // forwarded to the matrix parser; same default as parse_arguments
+    size_t rep = 0;                         // inner repetitions; 0 makes grbProgram deduce a count
+    size_t outer = 1;                       // outer repetitions
+};
+
+struct output { // results that grbProgram hands back to main
+    int error_code = 0;         // 0 on success; grbProgram assigns the nonzero codes
+    // TODO: remove iterations if not applicable
+    size_t iterations = 0; // total number of iterations performed does not make sense since the code does not have convergence criteria
+    double best_energy = 0.0;
+    size_t rep = 0;             // inner repetition count; initialised so it is never indeterminate
+    grb::utils::TimerResults times{}; // value-initialised for the same reason
+    std::unique_ptr< PinnedVector< JType > > pinnedSolutionVector;
+    std::unique_ptr< PinnedVector< JType > > pinnedRefSolutionVector;
+    // other things like eg: best replicas ...
+};
+
+template< typename Dtype >
+void read_matrix_data(const std::string &filename, std::vector<Dtype> &data, bool direct) {
+    // Parses the matrix file `filename`, records its size and nonzero count in
+    // Storage, and appends every parsed entry to `data`. On any error a message is
+    // printed and both `data` and the stored size are reset.
+	try {
+		Parser parser( filename, direct );
+		// a runtime check (not an assert) so that release builds reject non-square input too
+		if( parser.m() != parser.n() ) {
+			throw std::runtime_error( "coupling matrix is not square" );
+		}
+		std::get<0>(Storage::getData()) = parser.n();
+		try {
+			std::get<1>(Storage::getData()) = parser.nz();
+		} catch( ... ) {
+			// nz() threw; use the parser's entry count instead
+			std::get<1>(Storage::getData()) = parser.entries();
+		}
+		/* Once internal issue #342 is resolved this can be re-enabled
+		for( auto it = parser.begin( PARALLEL ); it != parser.end( PARALLEL ); ++it ) {
+			data.push_back( *it );
+		}*/
+		for( auto it = parser.begin( SEQUENTIAL ); it != parser.end( SEQUENTIAL ); ++it ) {
+			data.push_back( Dtype( *it ) );
+#ifdef DEBUG_IMSB
+			// print last data element from std::vector<NonzeroT> data
+			std::cout << "read_matrix_data: " << data.back().first.first << ", "
+				<< data.back().first.second << ", " << data.back().second << "\n";
+#endif
+		}
+	} catch( const std::exception &e ) {
+		std::cerr << "I/O program failed: " << e.what() << "\n";
+		// do not leave a partially read problem behind
+		std::get<0>(Storage::getData()) = 0;
+		data.clear();
+	}
+}
+
+template< typename NonzeroT, typename IType, typename VType >
+void read_matrix_data_from_array(
+	const std::vector<std::pair< std::pair< IType, IType >, VType > > &array,
+	std::vector<NonzeroT> &data
+) {
+	// Appends each ((row, column), value) entry of `array` to `data`, then stores test_data::n and data.size() in Storage.
+    try {
+        for ( auto it = array.cbegin(); it != array.cend(); ++it ) {
+            const auto &coords = it->first;
+            data.push_back( NonzeroT( coords.first, coords.second, it->second ) );
+#ifdef DEBUG_IMSB
+            const auto &last = data.back();
+            std::cout << "read_matrix_data_from_array: " << last.first.first << ", "
+                << last.first.second << ", " << last.second << "\n";
+#endif
+        }
+        auto &storage = Storage::getData();
+        std::get<0>(storage) = test_data::n;
+        std::get<1>(storage) = data.size();
+    } catch (const std::exception &e) {
+        std::cerr << "Failed to read matrix data from array: " << e.what() << "\n";
+        return;
+    }
+}
+
+template< typename Dtype >
+void read_vector_data(const std::string &filename, std::vector<Dtype> &data) {
+    // Appends every whitespace-separated value found in `filename` to `data`;
+    // several values may share one line. Errors are reported on std::cerr.
+    try {
+        std::ifstream file( filename );
+        if( !file.is_open() ) {
+            std::cerr << "Failed to open vector file: " << filename << "\n";
+            return;
+        }
+        std::string line;
+        while( std::getline( file, line ) ) {
+            if( line.empty() ) continue; // skip empty lines
+            std::istringstream iss( line );
+            Dtype v;
+            if( !(iss >> v) ) {
+                throw std::runtime_error( "Failed to parse line in vector file" );
+            }
+            do { data.push_back( v ); } while( iss >> v ); // keep all values of the line, not only the first
+        }
+    } catch( const std::exception &e ) {
+        std::cerr << "I/O program failed: " << e.what() << "\n";
+    }
+}
+
+
+template< typename Dtype >
+void read_vector_data_from_array(
+	const std::vector<Dtype> &array, std::vector<Dtype> &data
+) {
+	// Appends a copy of every element of `array`, in order, to the end of `data`.
+	try {
+		for ( const Dtype &value : array ) {
+			data.push_back( value );
+		}
+	} catch (const std::exception &e) {
+		std::cerr << "Failed to read vector data from array: " << e.what() << "\n";
+		return;
+	}
+}
+
+
+void ioProgram( const struct input &data_in, bool &success ) {
+
+	// Loads the problem data (embedded test data or the files named in `data_in`)
+	// and the run parameters into the Storage singleton. `success` becomes true
+	// only when a problem with a matching local-field vector was loaded.
+	success = false;
+    auto &storage = Storage::getData();
+    auto &n           = std::get<0>(storage); // n (rows/cols)
+    auto &nsweeps_st  = std::get<2>(storage); // nsweeps
+    auto &n_replicas_st = std::get<3>(storage); // n_replicas
+    auto &seed_st     = std::get<4>(storage); // seed
+    auto &sweep_name  = std::get<5>(storage); // sweep_name
+    auto &Jdata       = std::get<6>(storage); // std::vector<NonzeroT>
+    auto &h           = std::get<7>(storage); // std::vector<JType>
+
+    // Initialize metadata from input (allow CLI to override defaults)
+    nsweeps_st    = data_in.nsweeps;
+    n_replicas_st = data_in.n_replicas;
+    seed_st       = data_in.seed;
+    sweep_name    = data_in.sweep_name;
+
+    if ( data_in.use_default_data ) {
+        // use the embedded default problem from namespace test_data
+        read_matrix_data_from_array<NonzeroT>( test_data::j_matrix_data, Jdata );
+        read_vector_data_from_array<JType>( test_data::h_array_data, h );
+    } else {
+        // read from files if provided
+        read_matrix_data<NonzeroT>( data_in.filename_Jmatrix, Jdata, data_in.direct );
+        read_vector_data<JType>( data_in.filename_h, h );
+        if( data_in.verify && data_in.filename_ref_solution.empty() ) {
+            std::cerr << "Reference solution file not provided for verification\n";
+            return;
+        }
+        //read_vector_data<JType>( data_in.filename_ref_solution, sol );
+    }
+
+    // The readers only print their errors, so check what they actually produced.
+    if( n == 0 || h.size() != n ) {
+        std::cerr << "Failed to load problem data: n = " << n << ", h has " << h.size() << " entries\n";
+        return;
+    }
+	success = true;
+}
+
+
+void grbProgram(
+    const struct input &data_in,
+    struct output &out
+) {
+    // Builds the coupling matrix J, the local-field vector h and the initial replica
+    // states from Storage, then times the (still stubbed-out) solver call and fills `out`.
+    std::cout<< "grbProgram: running simulated-annealing RE solver (stub)\n";
+	// get user process ID
+	const size_t s = spmd<>::pid();
+	assert( s < spmd<>::nprocs() );
+
+    grb::utils::Timer timer;
+	timer.reset();
+
+    /* --- Problem setup --- */
+    const size_t n = std::get<0>(Storage::getData());
+	std::cout << "problem size n = " << n << "\n";
+    grb::Vector<JType> h( n );
+    // rc keeps the first error returned by the setup calls below
+    grb::RC rc = grb::SUCCESS;
+
+    // load into GraphBLAS
+    grb::Matrix<JType> J( n, n );
+	{
+		const auto &data = std::get<6>(Storage::getData());
+		RC io_rc = buildMatrixUnique(
+			J,
+			utils::makeNonzeroIterator<
+				grb::config::RowIndexType, grb::config::ColIndexType, JType
+			>( data.cbegin() ),
+			utils::makeNonzeroIterator<
+				grb::config::RowIndexType, grb::config::ColIndexType, JType
+			>( data.cend() ),
+			SEQUENTIAL
+		);
+		/* Once internal issue #342 is resolved this can be re-enabled
+		RC io_rc = buildMatrixUnique(
+			J,
+			utils::makeNonzeroIterator<
+				grb::config::RowIndexType, grb::config::ColIndexType, JType
+			>( data.cbegin() ),
+			utils::makeNonzeroIterator<
+				grb::config::RowIndexType, grb::config::ColIndexType, JType
+			>( data.cend() ),
+			PARALLEL
+		);*/
+		io_rc = io_rc ? io_rc : wait();
+		if( io_rc != SUCCESS ) {
+			std::cerr << "Failure: call to buildMatrixUnique did not succeed "
+				<< "(" << toString( io_rc ) << ")." << std::endl;
+			out.error_code = 5;
+			return;
+		}
+
+#ifdef DEBUG_IMSB
+	if( s == 0 ) {
+		std::cout << "Matrix J:\n";
+		print_matrix( J);
+	}
+#endif
+	}
+
+    // build vector h with data from singleton
+    {
+        const auto &h_data = std::get<7>(Storage::getData());
+		rc = rc ? rc : buildVector(
+			h,
+			h_data.cbegin(),
+			h_data.cend(),
+			SEQUENTIAL
+		);
+    }
+
+    // create states storage and initialize with random 1/0 values
+    const size_t n_replicas = std::get<3>(Storage::getData());
+    std::vector< grb::Vector<IOType> > states;
+    for ( size_t r = 0; r < n_replicas; ++r ) {
+        states.emplace_back( grb::Vector<IOType>(n) );
+        // each replica gets its own engine, seeded with the stored seed plus the replica index
+        std::default_random_engine generator( std::get<4>(Storage::getData()) + r );
+        std::uniform_int_distribution<int> distribution(0,1);
+        // draw n random 0/1 values and load them into the replica with buildVector
+        std::vector< IOType > rand_data;
+        for ( size_t i = 0; i < n; ++i ) {
+            rand_data.emplace_back( static_cast<IOType>(
+                distribution(generator) ) );
+        }
+        rc = rc ? rc : grb::buildVector(
+            states.back(),
+            rand_data.cbegin(),
+            rand_data.cend(),
+            SEQUENTIAL
+        );
+    }
+
+    #ifdef DEBUG_IMSB
+    if( s == 0 ) {
+        for ( size_t r = 0; r < n_replicas; ++r ) {
+            std::cout << "Initial state replica " << r << ":\n";
+            print_vector( states[r], 30 ,"states values" );
+            std::cout << std::endl;
+
+        }
+    }
+    #endif
+
+
+    // also make betas vector of size n_replicas and initialize with 10.0
+    grb::Vector<IOType> betas( n_replicas );
+    for ( size_t r = 0; r < n_replicas; ++r ) {
+        rc = rc ? rc : grb::setElement( betas, static_cast<IOType>(10.0), r );
+    }
+    rc = rc ? rc : wait();
+
+    // also make energies vector of size n_replicas and calculate values
+    // in python energies = np.array([get_energy(couplings, local_fields, state) for state in states])
+    // will be initialized in the algorithm
+    grb::Vector<IOType> energies( n_replicas );
+
+    // all temporary vectors and matrices should be created here
+
+    // TODO: add times
+
+	out.rep = data_in.rep;
+	// time a single call
+	if( out.rep == 0 ) {
+		timer.reset();
+		// rc = simulated_annealing_RE(
+        //     energies, states, J, h, ... other params ... ,
+        //     .. temp args, sol, out.iterations
+        // );
+
+		rc = rc ? rc : wait();
+		double single_time = timer.time();
+		if( !(rc == SUCCESS || rc == FAILED) ) {
+			std::cerr << "Failure: call to Simulated Annealing RE did not succeed ("
+				<< toString( rc ) << ")." << std::endl;
+			out.error_code = 20;
+		}
+		if( rc == FAILED ) {
+			std::cout << "Warning: call to Simulated Annealing RE did not converge\n";
+		}
+		if( rc == SUCCESS ) {
+			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
+		}
+		if( rc != SUCCESS ) {
+			out.error_code = 25;
+		}
+		out.times.useful = single_time;
+		out.rep = single_time > 0.0 ? static_cast< size_t >( 1000.0 / single_time ) + 1 : 1; // a zero timing must not reach the cast
+		if( rc == SUCCESS || rc == FAILED ) {
+			if( s == 0 ) {
+				if( rc == FAILED ) {
+					std::cout << "Info: cold Simulated Annealing RE did not converge within ";
+				} else {
+					std::cout << "Info: cold Simulated Annealing RE completed within ";
+				}
+				std::cout << out.iterations << " iterations. "
+					<< "Time taken was " << single_time << " ms. "
+					<< "Deduced inner repetitions parameter of " << out.rep << " "
+					<< "to take 1 second or more per inner benchmark.\n";
+			}
+		}
+	} else {
+		// do benchmark
+		timer.reset();
+		for( size_t i = 0; i < out.rep && rc == SUCCESS; ++i ) {
+			if( rc == SUCCESS ) {
+                // rc = simulated_annealing_RE(
+                //     energies, states, J, h, ... other params ... ,
+                //     .. temp args, sol, out.iterations
+                // );
+			}
+			if( grb::Properties<>::isNonblockingExecution ) {
+				rc = rc ? rc : wait();
+			}
+		}
+		const double time_taken = timer.time();
+		out.times.useful = time_taken / static_cast< double >( out.rep );
+		// print timing at root process
+		if( grb::spmd<>::pid() == 0 ) {
+			std::cout << "Time taken for " << out.rep << " "
+				<< "Simulated Annealing RE calls (hot start): " << out.times.useful << ". "
+				<< "Error code is " << grb::toString( rc ) << std::endl;
+			std::cout << "\tnumber of IM-SB iterations: " << out.iterations << "\n";
+			if( out.iterations > 0 ) { // the stub reports zero iterations; do not divide by zero
+				std::cout << "\tmilliseconds per iteration: " << ( out.times.useful / static_cast< double >( out.iterations ) ) << "\n";
+			}
+		}
+		sleep( 1 );
+	}
+
+	// start postamble
+	timer.reset();
+
+	// set error code
+	if( rc == FAILED ) {
+		out.error_code = 30;
+	} else if( rc != SUCCESS ) {
+		std::cerr << "Benchmark run returned error: " << toString( rc ) << "\n";
+		out.error_code = 35;
+		return;
+	}
+
+
+
+}
+
+
+// Prints the usage line and the list of supported options of this runner to standard output.
+void printhelp( char *progname ) {
+    std::cout << "Usage: " << progname << " [--use-default-data] [--j-matrix-fname STR] [--h-fname STR]\n";
+    std::cout << "       [--n-replicas INT] [--nsweeps INT] [--seed INT] [--sweep STR]\n";
+    std::cout << "       [--verify] [--ref-solution-fname STR] [--help]\n\n";
+    std::cout << "Options:\n";
+    std::cout << "  --use-default-data         Use embedded default test data\n";
+    std::cout << "  --j-matrix-fname STR       Path to J matrix file (matrix-market or supported)\n";
+    std::cout << "  --h-fname STR              Path to h (local fields) vector (whitespace separated)\n";
+    std::cout << "  --n-replicas INT           Number of replicas (default: 3)\n";
+    std::cout << "  --nsweeps INT              Number of sweeps (default: 2)\n";
+    std::cout << "  --seed INT                 RNG seed (default: 8)\n";
+    std::cout << "  --sweep STR                Sweep selector (default: sequential_sweep_immediate)\n";
+    std::cout << "  --verify                   Verify output against reference solution\n";
+    std::cout << "  --ref-solution-fname STR   Reference solution file (required with --verify unless using default data)\n";
+    std::cout << "  --help, -h                 Print this help message\n";
+}
+
+bool parse_arguments( input &in, int argc, char ** argv ) {
+    // Fills `in` from the command line; returns false on a parse error or when help is requested (main prints the usage text then).
+    in.filename_Jmatrix.clear();
+    in.filename_h.clear();
+    in.filename_ref_solution.clear();
+    in.direct = true;
+    // map benchmarking configuration to the runner's fields
+    in.rep = grb::config::BENCHMARKING::inner();
+    in.outer = grb::config::BENCHMARKING::outer();
+    in.verify = false; // keep verify default (false) unless overridden via CLI
+    try {
+        for ( int i = 1; i < argc; ++i ) {
+            const std::string a = argv[i];
+            if ( a == "--use-default-data" ) {
+                in.use_default_data = true;
+            } else if ( a == "--j-matrix-fname" ) {
+                if ( i+1 >= argc ) { std::cerr << "--j-matrix-fname requires an argument\n"; return false; }
+                in.filename_Jmatrix = argv[++i];
+            } else if ( a == "--h-fname" ) {
+                if ( i+1 >= argc ) { std::cerr << "--h-fname requires an argument\n"; return false; }
+                in.filename_h = argv[++i];
+            } else if ( a == "--n-replicas" ) {
+                if ( i+1 >= argc ) { std::cerr << "--n-replicas requires an argument\n"; return false; }
+                in.n_replicas = static_cast<size_t>( std::stoul(argv[++i]) );
+            } else if ( a == "--nsweeps" ) {
+                if ( i+1 >= argc ) { std::cerr << "--nsweeps requires an argument\n"; return false; }
+                in.nsweeps = static_cast<size_t>( std::stoul(argv[++i]) );
+            } else if ( a == "--seed" ) {
+                if ( i+1 >= argc ) { std::cerr << "--seed requires an argument\n"; return false; }
+                in.seed = static_cast<unsigned>( std::stoul(argv[++i]) );
+            } else if ( a == "--sweep" ) {
+                if ( i+1 >= argc ) { std::cerr << "--sweep requires an argument\n"; return false; }
+                in.sweep_name = argv[++i];
+            } else if ( a == "--verify" ) {
+                in.verify = true;
+            } else if ( a == "--ref-solution-fname" ) {
+                if ( i+1 >= argc ) { std::cerr << "--ref-solution-fname requires an argument\n"; return false; }
+                in.filename_ref_solution = argv[++i];
+            } else if ( a == "--help" || a == "-h" ) {
+                return false; // main prints the usage on false, so printing here would show it twice
+            } else {
+                std::cerr << "Unknown argument: " << a << "\n";
+                return false;
+            }
+        }
+    } catch ( const std::exception &e ) { // std::stoul throws on non-numeric or out-of-range values
+        std::cerr << "Invalid numeric argument: " << e.what() << "\n";
+        return false;
+    }
+    // basic validation
+    if ( !in.use_default_data && ( in.filename_Jmatrix.empty() || in.filename_h.empty() ) ) {
+        std::cerr << "Either --use-default-data or both --j-matrix-fname and --h-fname must be provided\n";
+        return false;
+    }
+    if ( in.verify && !in.use_default_data && in.filename_ref_solution.empty() ) {
+        std::cerr << "--ref-solution-fname required when --verify is used without --use-default-data\n";
+        return false;
+    }
+    return true;
+}
+
+// Entry point: parses the command line, then launches ioProgram and grbProgram and returns grbProgram's error code.
+int main( int argc, char ** argv ) {
+    std::cout << "simulated_anealing_re runner\n";
+    input in;
+    output out;
+
+    if ( !parse_arguments( in, argc, argv ) ) {
+        printhelp( argv[0] );
+        return 1;
+    }
+
+    // Seed the C library RNG with the requested seed (default 8). The replica
+    // states are seeded separately inside grbProgram from the stored seed.
+    std::srand( static_cast<unsigned>( in.seed ) );
+
+    std::cout << "seed=" << in.seed << " n_replicas=" << in.n_replicas << " nsweeps=" << in.nsweeps << " sweep=" << in.sweep_name << "\n";
+
+    // Run IO program (populates Storage or similar)
+    {
+        bool success = false;
+        grb::Launcher< AUTOMATIC > launcher;
+        grb::RC rc = launcher.exec( &ioProgram, in, success, true );
+        if ( rc != SUCCESS ) {
+            std::cerr << "I/O launcher failed: " << toString(rc) << "\n";
+            return 2;
+        }
+        if ( !success ) {
+            std::cerr << "I/O program reported failure\n";
+            return 3;
+        }
+    }
+
+    // Run main GraphBLAS program that builds data and calls reSA stub
+    {
+        grb::Launcher< AUTOMATIC > launcher;
+        grb::RC rc = launcher.exec( &grbProgram, in, out, true );
+        if ( rc != SUCCESS ) {
+            std::cerr << "grbProgram launcher failed: " << toString(rc) << "\n";
+            return 4;
+        }
+    }
+
+    std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.iterations << " best_energy=" << out.best_energy << "\n";
+    return out.error_code;
+}
\ No newline at end of file

From faccc91b613e9d1ec81ad6b056bab359ee6e6771 Mon Sep 17 00:00:00 2001
From: Denis Jelovina <denis.jelovina@huawei.com>
Date: Mon, 29 Sep 2025 16:20:48 +0200
Subject: [PATCH 02/58] Add test for parallel simulation of QUBO optimization
 using sequential sweeps. NOTE: remove in the fina

---
 tests/smoke/test_qubo_parallel.py | 664 ++++++++++++++++++++++++++++++
 1 file changed, 664 insertions(+)
 create mode 100644 tests/smoke/test_qubo_parallel.py

diff --git a/tests/smoke/test_qubo_parallel.py b/tests/smoke/test_qubo_parallel.py
new file mode 100644
index 000000000..b81ad862d
--- /dev/null
+++ b/tests/smoke/test_qubo_parallel.py
@@ -0,0 +1,664 @@
+"""
+Copyright © 2023, United States Government, as represented by the Administrator
+of the National Aeronautics and Space Administration. All rights reserved.
+
+The PySA, a powerful tool for solving optimization problems is licensed under
+the Apache License, Version 2.0 (the "License"); you may not use this file
+except in compliance with the License. You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0.
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+"""
+
+from more_itertools import distribute
+from itertools import repeat
+from multiprocessing import Pool
+from os import cpu_count
+
+import numpy as np
+from typing import List, Tuple, Any, Callable, Optional, NoReturn
+
+
+Vector = List[float]
+# Matrix is not List[List[float]]; a scipy sparse matrix (csr_matrix) is used instead
+from scipy.sparse import csr_matrix
+# Define general type
+dtype = 'float'
+# set random seed
+random_seed = 8
+
+Vector = List[float]
+Matrix = csr_matrix
+State = List[float]
+RefProbFun = Callable[[Vector, Optional[int]], float]
+EnergyFunction = Callable[[Matrix, Vector, State], float]
+UpdateSpinFunRef = Callable[
+    [Matrix, Vector, Vector, int, float, float, RefProbFun], float]
+UpdateSpinFunction = Callable[[Matrix, Vector, Vector, int, float, float],
+                              float]
+SweepFunction = Callable[[UpdateSpinFunction, Matrix, Vector, Vector, float],
+                         float]
+
+def partition_rows_by_independent_sets(couplings: csr_matrix, *, method: str = "welsh_powell"):
+    """
+    Partition the rows of a sparse symmetric matrix into independent sets (color classes)
+    so that within each set no two rows share a stored coupling (i.e. they are
+    pairwise non-adjacent in the row-interaction graph).
+
+    Rows are visited in order of decreasing number of stored entries (Welsh-Powell)
+    and each row is put into the first color class that contains none of its
+    neighbors. The resulting blocks can be updated in parallel (or in any order)
+    without violating sequential Metropolis dependencies.
+
+    Parameters
+    - couplings: csr_matrix (shape [n, n]) sparse symmetric coupling matrix (only sparsity matters)
+    - method: currently unused; only the "welsh_powell" ordering is implemented
+
+    Returns
+    - blocks: list of lists, each inner list is a color class containing row indices
+    - colors: numpy array of length n with integer color for each row
+
+    Cost: the sort plus, per row, a scan of its neighbors and of the existing colors
+    """
+
+    if not isinstance(couplings, csr_matrix):
+        couplings = csr_matrix(couplings)
+
+    n = couplings.shape[0]
+
+    # CSR arrays of the sparsity pattern: the neighbors of row v are
+    # indices[indptr[v]:indptr[v+1]]
+    indptr = couplings.indptr
+    indices = couplings.indices
+
+    degrees = np.diff(indptr)
+
+    # Order vertices by decreasing degree (Welsh-Powell)
+    order = np.argsort(-degrees)
+
+    colors = -1 * np.ones(n, dtype=int)
+    blocks = []
+
+    for v in order:
+        if colors[v] != -1:
+            continue
+
+        # Colors already taken by the neighbors of v. Looking them up in `colors`
+        # costs O(degree) and replaces an intersect1d against every block; it is
+        # equivalent because block c holds exactly the rows with colors == c.
+        # Uncolored neighbors (and a diagonal entry of v itself) contribute -1,
+        # which never matches a block index.
+        neigh = indices[indptr[v]:indptr[v+1]]
+        taken = set(colors[neigh].tolist())
+
+        # First existing color class that contains no neighbor of v, if any
+        c_idx = next((c for c in range(len(blocks)) if c not in taken), len(blocks))
+        if c_idx == len(blocks):
+            # No existing class fits: create a new color
+            blocks.append([])
+        blocks[c_idx].append(int(v))
+        colors[v] = c_idx
+
+    return blocks, colors
+
+
+def compute_row_batches(couplings: csr_matrix, *, order: str = "color", return_row_blocks: bool = False):
+    """
+    Return the sizes of the row batches that together cover every row exactly once.
+
+    The rows are split into independent sets by
+    `partition_rows_by_independent_sets`; the sets are kept in the order in
+    which that function returns them (color order) and are meant to be
+    processed one after another, e.g. by a sweep that updates a whole
+    independent block before moving on.
+
+    Parameters
+    - couplings: csr_matrix
+    - order: accepted for future use; the value is currently ignored
+    - return_row_blocks: if True return (batches, row_blocks) else return batches
+
+    Returns
+    - batches: list[int] with the size of each batch
+    - row_blocks (optional): list[list[int]] of row indices per batch
+    """
+
+    # The per-row color array is not needed here, only the blocks themselves.
+    row_blocks, _ = partition_rows_by_independent_sets(couplings)
+
+    batches = list(map(len, row_blocks))
+
+    if not return_row_blocks:
+        return batches
+
+    return batches, row_blocks
+
+
+
+def masked_row_dot(couplings: Matrix, D: np.ndarray, pos: int, bs: int) -> float:
+    """
+    Multiply rows pos .. pos+bs-1 of `couplings` with the accumulated deltas
+    `D`, i.e. evaluate J[pos:pos+bs, :] dot D without applying the full
+    h += J.dot(D) update. With bs == 1 the result holds the single value
+    needed for the Metropolis decision at row `pos`.
+
+    This Python prototype converts the whole matrix to dense form on every
+    call; in C++ this is expected to map to a masked-mxv primitive instead.
+    """
+    dense = couplings.todense()
+    block_rows = dense[pos:pos+bs, :]
+    return block_rows.dot(D)
+
+
+def get_energy(couplings: Matrix, local_fields: Vector, state: State) -> float:
+    """
+    Return state . (couplings . state / 2 + local_fields) for a 0/1 `state`.
+    """
+    n_rows, n_cols = couplings.shape
+    is_binary = (state == 0) | (state == 1)
+    assert np.all(is_binary), "State must contain only 0 or 1 values."
+    assert n_rows == n_cols, "Couplings must be a square matrix."
+    assert n_rows == state.shape[0], "State and couplings must have compatible shapes."
+    assert local_fields.shape[0] == state.shape[0], "Local fields and state must have compatible shapes."
+    return state.dot(couplings.dot(state) / 2 + local_fields)
+
+
+def sequential_sweep_x(couplings: Matrix, local_fields: Vector,
+                     state: State, beta: float) -> float:
+    """
+    Metropolis sweep that preserves sequential Metropolis semantics but avoids
+    expensive per-row dense conversions by precomputing the local field
+    h = J.dot(state) + local_fields and updating h incrementally when a spin
+    flip is accepted. Works with CSR sparse `couplings`.
+    """
+
+    n = len(state)
+
+    # Precompute local fields h = J.dot(state) + local_fields. In a lazy
+    # evaluation framework you'd typically start a pipeline here that
+    # represents h but we materialize once for correctness in the Python
+    # prototype.
+    h = couplings.dot(state) + local_fields
+
+    # Random numbers (log uniform)
+    log_r = np.log(np.random.random(size=n))
+
+    delta_energy = 0.0
+
+    # D is the lazy accumulator vector: it holds pending 0/1 deltas that we
+    # haven't yet materialized into h. The lazy evaluator in your framework
+    # would instead record these into a pipeline and only execute when a
+    # reduction-to-scalar (dot, min, sum) requires it.
+    D = np.zeros(n, dtype=h.dtype)
+
+    # iterate sequentially (Metropolis semantics). We allow D to accumulate
+    # multiple nonzeros. For each pos we compute the scalar contribution from
+    # D to h[pos] via a masked row dot (row J[pos,:] dot D) so we don't have
+    # to apply the whole matvec until we need to.
+
+    # Use precomputed batches/row_blocks attached to the matrix when available
+    # (compute_row_batches attaches them to the matrix as _row_blocks/_batches)
+    if hasattr(couplings, '_row_blocks') and hasattr(couplings, '_batches'):
+        row_blocks = getattr(couplings, '_row_blocks')
+        batches = getattr(couplings, '_batches')
+    else:
+        # Compute batches (row blocks) from the matrix sparsity pattern. Each
+        # block is an independent set and can be updated without internal
+        # dependencies. compute_row_batches returns sizes; request row blocks too.
+        batches, row_blocks = compute_row_batches(couplings, return_row_blocks=True)
+
+    # Iterate sequentially over blocks. Inside each block rows are independent
+    # so they can be processed in any order (or in parallel) while preserving
+    # Metropolis semantics between blocks.
+    for block_idx, rows in enumerate(row_blocks):
+        # test_states for debugging
+        test_states = state.copy()
+
+        # If there are pending deltas compute their effect on these rows only
+        # using a masked row dot over the block. masked_row_dot expects a
+        # starting pos and block size; we call it for each row in the block.
+        #row_contrib = np.asarray([masked_row_dot(couplings, D, r, 1).item() for r in rows])
+        # rewritten as a dense matrix vector product for the block to simulate efficent code
+        print("rows:",rows)
+        row_contrib = couplings[rows, :].dot(D)
+
+        # compute delta energies for the block
+        state_slice = np.array(state)[rows]
+        h_slice = np.array(h)[rows]
+        dn = (2.0 * state_slice - 1.0) * (h_slice + row_contrib)
+
+        # Vectorized Metropolis decision for rows in this block
+        accept = (dn >= 0) | (log_r[rows] < beta * dn)
+        old = np.array(state)[rows]
+        new = np.where(accept, 1 - old, old)
+        delta_energy += -np.sum(dn * accept)
+        # Update state and D in place
+        for idx, r in enumerate(rows):
+            state[r] = new[idx]
+            D[r] += (new[idx] - old[idx])
+
+
+    # Flush any remaining accumulated deltas into h before returning. This
+    # materializes the lazy pipeline; the framework could do this lazily at
+    # a later synchronization point instead.
+    # if np.any(D):
+    #     apply_accumulated(couplings, D, h)
+
+    return float(delta_energy)
+
+
+def sequential_sweep_immediate(couplings: Matrix, local_fields: Vector,
+                               state: State, beta: float, printinfo: bool = False) -> float:
+    """
+    Immediate-update Metropolis sweep: on each accepted flip we update the
+    local-field vector `h` immediately by iterating the nonzeros of the
+    flipped row (neighbor updates). This is the standard efficient approach
+    for CSR matrices and is provided here for performance comparison against
+    the lazy `D`-accumulator approach.
+
+    For simplicity in this prototype, we convert the sparse matrix to dense internally.
+    """
+    n = len(state)
+    # Convert couplings to dense numpy array
+    dense_couplings = couplings.toarray()
+    h = dense_couplings.dot(state) + local_fields
+    log_r = np.log(np.random.random(size=n))
+    delta_energy = 0.0
+
+    # Use same batching mechanism as sequential_sweep_x to improve locality.
+    if hasattr(couplings, '_row_blocks') and hasattr(couplings, '_batches'):
+        row_blocks = getattr(couplings, '_row_blocks')
+        batches = getattr(couplings, '_batches')
+    else:
+        batches, row_blocks = compute_row_batches(couplings, return_row_blocks=True)
+
+    for block_idx, rows in enumerate(row_blocks):
+        # process rows in this independent block (vectorized)
+        if printinfo:
+            print("rows = ", rows)
+        rows = np.array(rows)
+        hi = h[rows]
+        si = state[rows]
+        dn = (2.0 * si - 1.0) * hi
+
+        # Vectorized Metropolis decision
+        accept = (dn >= 0) | (log_r[rows] < beta * dn)
+        old = si
+        new = np.where(accept, 1 - old, old)
+        delta = new - old
+
+        # Update state and delta_energy
+        state[rows] = new
+        delta_energy += -np.sum(dn * accept)
+
+        # Update h for all spins (dense update)
+        if np.any(delta):
+            h += dense_couplings[:, rows].dot(delta)
+
+    return float(delta_energy)
+
+
+
+def pt(states: List[State], energies: List[float], beta_idx: List[int],
+       betas: List[float]) -> NoReturn:
+    """
+  Parallel tempering move.
+    states: [n_replicas, ...]  Array of replicas
+    energies: [n_replicas] Array of energies of each replica
+    beta_idx: [n_replicas] The replica index currently assigned to each beta,
+        i.e. inverse temperature K is currently used for simulating replica beta_idx[K]
+    betas: [n_replicas] Sequential array of inverse temperatures.
+
+    This function only modifies the order of beta_idx.
+  """
+    print("pt(in):",energies[beta_idx], "  beta(in):", betas[beta_idx])
+
+    # Get number of replicas
+    n_replicas = len(states)
+
+    # Apply PT for each pair of replicas
+    for k in range(n_replicas - 1):
+
+        # Get first index
+        k1 = n_replicas - k - 1
+
+        # Get second index
+        k2 = n_replicas - k - 2
+
+        # Compute delta energy
+        de = (energies[beta_idx[k1]] - energies[beta_idx[k2]]) * (betas[k1] - betas[k2])
+
+        # Accept/reject following Metropolis
+        if de >= 0 or np.random.random() < np.exp(de):
+            beta_idx[k1], beta_idx[k2] = beta_idx[k2], beta_idx[k1]
+
+    print("pt(out):",energies[beta_idx], "  beta(out):", betas[beta_idx])
+
+
+
+def simulation_parallel_x(
+                        sweep: SweepFunction,
+                        couplings: Matrix,
+                        local_fields: Vector,
+                        states: List[State],
+                        energies: List[float],
+                        beta_idx: List[int],
+                        betas: List[float],
+                        n_sweeps: int,
+                        get_part_fun: bool = False,
+                        use_pt: bool = True) -> Tuple[State, float, int, int]:
+    """
+  Apply simulation.
+  """
+
+    # Get number of replicas
+    n_replicas = len(states)
+
+    # Best configuration/energy
+    _best_energy = np.copy(energies)
+    _best_state = np.copy(states)
+    _best_sweeps = np.zeros(n_replicas, dtype=np.int32)
+    betas_sorted = np.empty_like(betas)
+    log_omegas = np.zeros(n_sweeps)
+
+    # For each run ...
+    for s in range(n_sweeps):
+        for k in range(n_replicas):
+            betas_sorted[beta_idx[k]] = betas[k]
+        # ... apply sweep for each replica ...
+        # interate k in random order to avoid bias
+        #print("betas_sorted: ",betas_sorted)
+        perm = np.random.permutation(n_replicas)
+        print("perm: ",perm)
+        for k in perm:  # numba.prange(n_replicas):
+
+            # Apply sweep
+            tmp = sweep(couplings, local_fields,states[k], betas_sorted[k])
+            energies[k] += tmp
+            print("Replica ",k," energy=",energies[k])
+
+            # Store best state
+            if energies[k] < _best_energy[k]:
+                _best_energy[k] = energies[k]
+                _best_state[k] = np.copy(states[k])
+                _best_sweeps[k] = s
+
+        # ... and pt move.
+        #print("beta_idx after sweep: ",beta_idx)
+        if use_pt:
+            pt(states, energies, beta_idx, betas)
+        #print("beta_idx after pt:    ",beta_idx)
+        # Calculate the weights for the partition function
+
+    # Get lowest energy
+    best_pos = np.argmin(_best_energy)
+    best_state = _best_state[best_pos]
+    best_energy = _best_energy[best_pos]
+    best_sweeps = _best_sweeps[best_pos]
+
+    # Return states and energies
+    return ((states, energies, beta_idx, log_omegas), (best_state, best_energy,
+                                                       best_sweeps, s + 1))
+
+
+
+def get_min_energy(couplings: Matrix, local_fields: Vector):
+
+    # Get number of variables
+    n_vars = couplings.shape[0]
+    # assert square matrix
+    assert couplings.shape[0] == couplings.shape[1], "Couplings must be a square matrix."
+    # assert compatible shapes
+    assert local_fields.shape[0] == n_vars, "Local fields must have the same number of variables as couplings."
+    min_energy = np.inf
+    best_state = np.array([0]*n_vars, dtype=dtype)
+
+    # Find minimum energy by bruteforce
+    for state in range(2**n_vars):
+        # Transform state
+        spin_state_unsigned = np.array([int(x) for x in bin(state)[2:].zfill(n_vars)], dtype=dtype)
+        # Get energy for the state
+        assert np.all(np.isin(spin_state_unsigned, [0, 1])), "State contains values other than 0 and 1"
+        energy = get_energy(couplings, local_fields, spin_state_unsigned)
+        # Store only the minimum energy
+        if energy < min_energy:
+            min_energy = energy
+            best_state = np.copy(spin_state_unsigned)
+
+    return min_energy, best_state
+
+
+def gen_random_problem(n_vars: int,
+                       dtype: Any = 'float', nzratio = 0.1, test_dense: bool = False, printinfo: bool = False) -> Tuple[Matrix, Vector]:
+    """
+    Generate a random problem: a sparse couplings matrix and a local-fields vector.
+
+    The couplings are symmetrized and their diagonal is set to zero; the local
+    fields are drawn uniformly from [-1, 1). `printinfo` prints the result.
+    """
+    if (test_dense):
+        couplings = 2 * np.random.random((n_vars, n_vars)).astype(dtype) - 1
+        couplings = (couplings + couplings.T) / 2
+        vals = couplings.flatten()
+        row = np.array([i for i in range(n_vars) for j in range(n_vars)])
+        col = np.array([j for i in range(n_vars) for j in range(n_vars)])
+    else:
+        # sparse case: nz index pairs are drawn, with nz set by nzratio
+        nz = int(nzratio * n_vars * n_vars)
+        row = np.sort(np.random.randint(0, n_vars, nz))
+        col = np.sort(np.random.randint(0, n_vars, nz))
+        # NOTE(review): np.unique is called without `axis`, so it returns the
+        # sorted distinct values of the flattened (row, col) array rather than
+        # positions of unique pairs; these values are then used as positions
+        # into row/col. Presumably unintended, but the seeded reference energy
+        # of the test depends on this exact sequence -- confirm before changing.
+        unique = np.unique(np.array([row, col]).T)
+        row = row[unique]
+        col = col[unique]
+        vals = 2 * np.random.random(len(unique)).astype(dtype) - 1
+
+    couplings = csr_matrix((vals, (row, col)), shape=(n_vars, n_vars))
+    couplings = (couplings + couplings.T) / 2
+    diag_couplings = couplings.diagonal()
+    # set diagonal to zero by subtracting a diagonal matrix holding it
+    couplings = couplings - csr_matrix((diag_couplings, (np.arange(n_vars), np.arange(n_vars))), shape=(n_vars, n_vars))
+
+    if printinfo:
+        # print the nonzeros in COO form (values for the unit test), then the pattern
+        print("Couplings matrix (indices and nonzero values in COO):")
+        print("Row indices:", couplings.nonzero()[0])
+        print("Column indices:", couplings.nonzero()[1])
+        print("Nonzero values:", couplings.data)
+
+        print("Couplings matrix structure (*=nonzero, .=zero):")
+        dense_couplings = couplings.toarray()
+        for i in range(n_vars):
+            row_str = ""
+            for j in range(n_vars):
+                if dense_couplings[i,j] != 0:
+                    row_str += "* "
+                else:
+                    row_str += ". "
+            print("[{}]".format(i),"\t",row_str)
+
+    # local fields are random instead of being taken from the diagonal
+    local_fields = 2 * np.random.random(n_vars).astype(dtype) - 1
+    if printinfo:
+        print("Local fields (random):", local_fields)
+
+    return couplings, local_fields
+
+
+def test_sequential_sweep_simulation_qubo(n_vars: int):
+    """
+    Run a short simulation on a random problem and check its best energy against
+    a stored reference value and against the brute-force minimum.
+    """
+    n_replicas = 3
+    print("n_vars =",n_vars)
+    print("n_replicas =",n_replicas)
+
+    # Generate random problem
+    couplings, local_fields = gen_random_problem(n_vars, dtype=dtype,printinfo=True)
+
+    # Find minimum energy by bruteforce
+    min_energy,best_state_bruteforce = get_min_energy(couplings, local_fields)
+
+    # All replicas use the same beta value
+    betas = np.array([10]*n_replicas, dtype=dtype)
+    print("Betas =",betas)
+    beta_idx = np.arange(n_replicas)
+    print("Initial beta_idx =",beta_idx)
+    # Get initial state: one random 0/1 vector per replica
+    states = np.random.randint(2, size=(n_replicas, n_vars)).astype(dtype)
+    print("Initial states =")
+    for s in states:
+        print(s)
+
+    # Compute energies
+    for s in states:
+        assert np.all(np.isin(s, [0, 1])), "State contains values other than 0 and 1"
+    energies = np.array(
+        [get_energy(couplings, local_fields, state) for state in states])
+    print("Initial energies=",energies)
+
+    # Simulate
+    print("beta_idx =",beta_idx)
+    nsweeps = 2
+    (state, energy, _, _), (best_state, best_energy, _, _) = simulation_parallel_x(
+        sequential_sweep_immediate, 
+        couplings, 
+        local_fields,
+        states, 
+        energies, 
+        beta_idx, 
+        betas, 
+        nsweeps)
+
+    # Check that best energy matches the stored reference
+    #ref_best_energy = -7.9322789708332255 # dense
+    # NOTE(review): labelled "dense", but the problem above uses test_dense=False
+    ref_best_energy = -5.079571790854985 # dense
+    ref_nsweeps = 2
+    ref_n_vars = 16
+    ref_random_seed = 8
+    # the reference only holds for these exact parameters
+    assert n_vars == ref_n_vars, f"n_vars {n_vars} does not match reference {ref_n_vars}"
+    assert nsweeps == ref_nsweeps, f"nsweeps {nsweeps} does not match reference {ref_nsweeps}"  
+    assert random_seed == ref_random_seed, f"random_seed {random_seed} does not match reference {ref_random_seed}"
+    assert np.isclose(best_energy, ref_best_energy), f"best_energy {best_energy} does not match reference {ref_best_energy}"
+
+    assert np.all(np.isin(best_state, [0, 1])), "State contains values other than 0 and 1"
+    qubo_energy = get_energy(couplings, local_fields, best_state)
+
+    print("beta_idx(out) =",beta_idx)
+
+    print("energy      =",energy)
+    for e in energy:
+        print(e)
+    print("best_energy by solver     = ",best_energy)
+    print("best_energy by bruteforce = ",min_energy)
+
+    print("state(out)      =")
+    for s in state:
+        print(s)
+    print("best state by solver     = ",best_state)
+    print("best state by bruteforce = ",best_state_bruteforce)
+
+    assert (np.isclose(best_energy,
+                       get_energy(couplings, local_fields, best_state)))
+
+    # Best energy must never be below the brute-force minimum
+    assert (np.round(best_energy, 6) >= np.round(min_energy, 6))
+
+
+
+# ## test semantics of row-wise vs immediate update sweeps
+# def test_rowwise_vs_immediate_equivalence(
+#     nzratio=0.25,
+#     seed0 = 12345,
+#     seed1 = 20241010,
+#     n_vars = 815,
+#     beta = 10.0):
+#     """
+#     Cross-check that a row-wise sequential sweep implemented via
+#     `sequential_sweep_rowwise` (with an `update_spin` that follows the
+#     immediate-neighbor-update semantics) produces the same delta_energy and
+#     final state as `sequential_sweep_immediate` when both use the same RNG
+#     seed.
+#     """
+#     def sequential_sweep_rowwise(couplings: Matrix, local_fields: Vector,
+#                         state: State, beta: float) -> float:
+#         """
+#     Metropolis update.
+#     """
+#         def update_spin(couplings: Matrix, local_fields: Vector, state: State, pos: int, beta: float, log_r: float) -> float:
+#             """
+#             Update spin accordingly to Metropolis update.
+#             """
+#             # Ensure state is 0/1
+#             assert np.all((state == 0) | (state == 1)), "State must contain only 0 or 1 values."
+#             # Ensure pos is valid
+#             assert 0 <= pos < state.shape[0], "pos index out of bounds."
+#             # Ensure shapes are compatible
+#             assert couplings.shape[0] == couplings.shape[1], "Couplings must be a square matrix."
+#             assert couplings.shape[0] == state.shape[0], "State and couplings must have compatible shapes."
+#             assert local_fields.shape[0] == state.shape[0], "Local fields and state must have compatible shapes."
+
+#             # Get the negate delta energy (qubo)
+#             # delta_n_energy = (2. * state[pos] - 1.) * (couplings[pos].dot(state) + local_fields[pos]) 
+#             # # we need to rewrite this for sparse matrix couplings, ie couplings[pos] for dense matrix becomes couplings.getrow(pos).toarray()[0]
+#             delta_n_energy = (2. * state[pos] - 1.) * (couplings.getrow(pos).toarray()[0].dot(state) + local_fields[pos])
+
+#             # Metropolis update
+#             if delta_n_energy >= 0 or log_r < beta * delta_n_energy:
+#                 # Update spin (qubo)
+#                 state[pos] = 0 if state[pos] else 1
+#                 # Return delta energy
+#                 return -delta_n_energy
+#             else:
+#                 # Otherwise, return no change in energy
+#                 return 0.
+
+#         # Get random numbers
+#         log_r = np.log(np.random.random(size=len(state)))
+
+#         # Try to update every spin
+#         delta_energy = 0.
+#         for pos in range(len(state)):
+#             delta_energy += update_spin(couplings, local_fields, state, pos, beta,log_r[pos])
+
+#         return delta_energy
+
+#     np.random.seed(seed0)
+#     couplings, local_fields = gen_random_problem(n_vars, dtype=dtype, nzratio=nzratio)
+
+
+#     # initial state
+#     init_state = np.random.randint(2, size=n_vars).astype(dtype)
+
+#     # copy for both runs
+#     state_immediate = init_state.copy()
+#     state_rowwise = init_state.copy()
+
+#     np.random.seed(seed1)
+#     delta_immediate = sequential_sweep_immediate(couplings, local_fields, state_immediate, beta)
+
+#     # run rowwise with the same RNG sequence
+#     np.random.seed(seed1)
+#     delta_rowwise = sequential_sweep_rowwise(couplings, local_fields, state_rowwise, beta)
+
+#     # Compare energies and final states
+#     assert np.isclose(delta_immediate, delta_rowwise), f"delta_energy differs: immediate={delta_immediate}, rowwise={delta_rowwise}"
+#     assert np.array_equal(state_immediate, state_rowwise), f"states differ after sweep: immediate={state_immediate}, rowwise={state_rowwise}"
+#     print("Rowwise and Immediate sweeps are equivalent.")
+
+# for i in range(5):
+#     test_rowwise_vs_immediate_equivalence(seed1=i)
+
+# run several time to check parallelism
+for _ in range(4):
+    np.random.seed(random_seed)
+    test_sequential_sweep_simulation_qubo(n_vars=16)

From 45ff2836bb0fcf0efc318dbae8d399b234ee6623 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Thu, 16 Oct 2025 11:31:11 +0200
Subject: [PATCH 03/58] Not everything is broken

---
 .../algorithms/simulated_annealing_re.cpp     | 197 ++++++++++++++++++
 tests/smoke/simulated_annealing_re.cpp        | 170 ++++++++++++---
 2 files changed, 338 insertions(+), 29 deletions(-)
 create mode 100644 include/graphblas/algorithms/simulated_annealing_re.cpp

diff --git a/include/graphblas/algorithms/simulated_annealing_re.cpp b/include/graphblas/algorithms/simulated_annealing_re.cpp
new file mode 100644
index 000000000..4ce4b6672
--- /dev/null
+++ b/include/graphblas/algorithms/simulated_annealing_re.cpp
@@ -0,0 +1,197 @@
+/*
+ *   Copyright 2025 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides a Simulated Annealing-Replica Exchange QUBO optimizer.
+ *
+ * @author Giovanni Gaio
+ * @date TODO 2025
+ */
+
+#ifndef _H_GRB_ALGORITHMS_SA_RE
+#define _H_GRB_ALGORITHMS_SA_RE
+
+#include <vector>
+#include <algorithm>
+#include <cstdlib>
+#include <assert.h>
+
+#ifndef NDEBUG
+#include <iostream>
+#endif
+
+
+#include <graphblas.hpp>
+
+namespace grb {
+
+	namespace algorithms {
+
+		/*
+		 * Parallel Tempering move.
+		 * Visits the replica pairs (i-1, i) in increasing order of i and exchanges
+		 * their entries of betas following the Metropolis criterion. Only betas is
+		 * modified; states is used for the number of replicas only.
+		 */
+		template<
+			typename StateType,
+			typename EnergyType,
+			typename TempType,
+			Backend backend
+		>
+		grb::RC pt(
+			std::vector< grb::Vector< StateType, backend > > &states,
+			grb::Vector< EnergyType > &energies,
+			grb::Vector< TempType > &betas
+		) {
+			const size_t n_replicas = states.size();
+			for( size_t upper = 1; upper < n_replicas; ++upper ) {
+				const size_t lower = upper - 1;
+				const EnergyType de = ( energies[ upper ] - energies[ lower ] ) * ( betas[ upper ] - betas[ lower ] );
+				// std::rand is only drawn when de is not already non-negative
+				const bool exchange = de >= 0 || std::rand() < RAND_MAX*std::exp(de);
+				if( exchange ) { std::swap( betas[ upper ], betas[ lower ] ); }
+			}
+			return grb::SUCCESS;
+		}
+
+		/*
+		 * Estimate a solution to a given Quadratic Unconstrained Binary Optimization
+		 * (QUBO) problem using Simulated Annealing-Replica Exchange (also known as
+		 * Parallel Tempering).
+		 *
+		 * Every sweep applies the user-supplied \a sweep function once to each
+		 * replica, visiting the replicas in a random order drawn via std::rand, and
+		 * adds the returned value to the energy of that replica. If \a use_pt is
+		 * set, a parallel tempering move follows each sweep.
+		 *
+		 * @param[in]     sweep          Function that updates one state for a given
+		 *                               entry of \a betas; returns an energy change.
+		 * @param[in,out] states         On input: the initial state of each replica.
+		 *                               On success: per replica, its best state.
+		 * @param[in]     Q              The (square) couplings matrix.
+		 * @param[in]     local_fields   The local fields, one entry per variable.
+		 * @param[in,out] energies       On input: the energy of each initial state.
+		 *                               On success: per replica, its lowest energy.
+		 * @param[in,out] betas          One temperature parameter per replica; may
+		 *                               be permuted by the parallel tempering move.
+		 * @param[in]     n_sweeps       Number of sweeps.
+		 * @param[in]     use_pt         Whether to apply parallel tempering.
+		 * @param[in]     ring           The semiring that is passed on to \a sweep.
+		 *
+		 * @returns grb::SUCCESS, or the error code of a failed tempering move.
+		 */
+		template<
+			typename QType, // type of coupling matrix values
+			typename StateType, // type of state, ideally 0/1
+			typename EnergyType,
+			typename TempType,
+			typename RSI, typename CSI, typename NZI, Backend backend,
+			class Ring = Semiring<
+				grb::operators::add< QType >, grb::operators::mul< QType >,
+				grb::identities::zero, grb::identities::one
+				>
+			>
+		grb::RC simulated_annealing_RE(
+				const std::function<
+					EnergyType(
+						 const grb::Matrix< QType, backend, RSI, CSI, NZI >&,
+						 const grb::Vector< QType, backend >&,
+						 grb::Vector< StateType, backend >&,
+						 const TempType&,
+						 const Ring&
+				 	)
+				> &sweep,
+				std::vector< grb::Vector< StateType, backend > > &states,
+				const grb::Matrix< QType, backend, RSI, CSI, NZI > &Q,
+				const grb::Vector< QType, backend > &local_fields,
+				grb::Vector< EnergyType > &energies,
+				grb::Vector< TempType > &betas,
+				const size_t &n_sweeps = 1,
+				const bool &use_pt = false,
+				const Ring &ring = Ring()
+				){
+			const size_t n_replicas = states.size();
+			assert( n_replicas > 0 );
+			assert( n_replicas == grb::size( betas ) );
+			assert( grb::ncols( Q ) == grb::nrows( Q ) );
+			assert( grb::size( states[0] ) == grb::nrows( Q ) );
+			assert( grb::size( states[0] ) == grb::size( local_fields ) );
+			for(size_t i = 1; i < n_replicas ; ++i ){
+				assert( grb::size( states[0] ) == grb::size( states[ i ] ) );
+			}
+#ifndef NDEBUG
+			const size_t n = grb::size(states[0]);
+			std::cerr << "DEBUG: Called  simulated_annealing_RE with parameters: "
+				      << "\n\t n = " << n
+				      << "\n\t n_replicas = " << n_replicas
+				      << "\n\t n_sweeps = " << n_sweeps
+				      << "\n\t use_pt = " << use_pt
+				      << std::endl;
+#endif
+			grb::RC rc = grb::SUCCESS;
+			// Best state and energy per replica. These are per-call copies: a static
+			// would keep its moved-from contents from a previous call.
+			std::vector< grb::Vector< StateType, backend > > best_states = states;
+			auto best_energies = energies;
+			// order in which the replicas are visited; reshuffled before each sweep
+			std::vector< size_t > order( n_replicas );
+			for( size_t j = 0 ; j < n_replicas ; ++j ){ order[ j ] = j; }
+			for( size_t i_sweep = 0 ; rc == grb::SUCCESS && i_sweep < n_sweeps ; ++i_sweep ){
+				// randomize the visiting order (Fisher-Yates). Only indices are shuffled,
+				// so states, energies, betas and the best records stay aligned.
+				for( size_t k = n_replicas ; k > 1 ; --k ){
+					std::swap( order[ k - 1 ], order[ std::rand() % k ] );
+				}
+				for( size_t idx = 0 ; rc == grb::SUCCESS && idx < n_replicas ; ++idx ){
+					const size_t j = order[ idx ];
+					energies[j] += sweep( Q, local_fields, states[j], betas[j], ring );
+					// update_best state and energy
+					if( energies[j] < best_energies[j] ){
+						best_energies[j] = energies[j];
+						best_states[j] = states[j];
+					}
+				} // n_replicas
+				if( rc == SUCCESS && use_pt ){ // Parallel Tempering move
+					rc = pt( states, energies, betas );
+				}
+#ifndef NDEBUG
+				std::cerr << "Energy at iteration " << i_sweep << " = " << energies[ 0 ] << std::endl;
+#endif
+			} // n_sweeps
+#ifndef NDEBUG
+			if( rc != grb::SUCCESS ){
+				std::cerr << "ERROR at line " <<  __LINE__ << " in "
+					      << __FILE__ << ": " << grb::toString( rc ) << "\n";
+			}
+#endif
+			if( rc == SUCCESS ){
+				// copy assignment throws an error. We'll do move-assignment I guess
+				states = std::move(best_states);
+				energies = std::move(best_energies);
+			}
+			return rc;
+		}
+
+	} // namespace algorithms
+
+} // end namespace grb
+
+#endif // end _H_GRB_ALGORITHMS_SA_RE
+
+
diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index 73e9b1bbd..e60dbd8cc 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -19,7 +19,10 @@
 #include <memory>
 #include <algorithm>
 #include <cassert>
+#include <cstdlib>
+#include <unistd.h>
 
+#include <graphblas/algorithms/simulated_annealing_re.cpp>
 #include <graphblas/nonzeroStorage.hpp>
 #include <graphblas/utils/timer.hpp>
 #include <graphblas/utils/parser.hpp>
@@ -62,6 +65,7 @@ typedef grb::utils::Singleton<
         size_t,                    // nz (nonzeros)
         size_t,                    // nsweeps
         size_t,                    // n_replicas
+        bool,                      // use_pt
         unsigned,                  // seed
         std::string,               // sweep_name
         std::vector<NonzeroT>,     // matrix data
@@ -71,8 +75,9 @@ typedef grb::utils::Singleton<
 
 namespace test_data {
     constexpr size_t n = 16;
-    constexpr size_t n_replicas = 3;
     constexpr size_t nsweeps = 2;
+    constexpr size_t n_replicas = 3;
+    constexpr bool use_pt = true; 
     constexpr unsigned seed = 8;
 
     const std::vector< std::pair< std::pair< grb::config::RowIndexType, grb::config::ColIndexType >, JType > > j_matrix_data = {
@@ -112,6 +117,7 @@ struct input {
     std::string filename_h;
     size_t n_replicas = 3;
     size_t nsweeps = 2;
+    bool use_pt = true;
     unsigned seed = 8;
     std::string sweep_name = "sequential_sweep_immediate";
     bool verify = false;
@@ -124,8 +130,8 @@ struct input {
 struct output {
     int error_code = 0;
     // TODO: remove itrations if not applicable
-    size_t iterations = 0; // total number of iterations performed does not make sense since the code does not have convergence criteria
-    double best_energy = 0.0;
+    size_t iterations = 10; // total number of iterations performed does not make sense since the code does not have convergence criteria
+    double best_energy = std::numeric_limits< JType >::max();
 	size_t rep;
 	grb::utils::TimerResults times;
     std::unique_ptr< PinnedVector< JType > > pinnedSolutionVector;
@@ -161,7 +167,7 @@ void read_matrix_data(const std::string &filename, std::vector<Dtype> &data, boo
 			data.push_back( Dtype( *it ) );
 #ifdef DEBUG_IMSB
 			// print last data element from std::vector<NonzeroT> data
-			std::cout << "read_matrix_data: " << data.back().first.first << ", "
+			std::cout << "readmatrix_data: " << data.back().first.first << ", "
 				<< data.back().first.second << ", " << data.back().second << "\n";
 #endif
 		}
@@ -237,6 +243,90 @@ void read_vector_data_from_array(
 	}
 }
 
+/**
+ * Metropolis sweep with immediate local-field updates: the spins are visited
+ * in index order and every accepted flip is propagated into the local field
+ * h before the next spin is considered.
+ *
+ * @param[in]     couplings    The couplings matrix.
+ * @param[in]     local_fields The local fields.
+ * @param[in,out] state        The 0/1 state; updated in place.
+ * @param[in]     beta         The inverse temperature of the acceptance test.
+ * @param[in]     ring         The semiring used for the matrix-vector products.
+ * @returns The energy change caused by the flips accepted during this call.
+ */
+template<
+		// Backend backend=grb::reference,
+		class Ring = Semiring<
+			grb::operators::add< JType >, grb::operators::mul< JType >,
+			grb::identities::zero, grb::identities::one
+		> >
+static JType sequential_sweep_immediate(
+				 const grb::Matrix< JType >& couplings,
+				 const grb::Vector< JType > &local_fields,
+				 grb::Vector< IOType > &state,
+				 const JType &beta,
+				 const Ring &ring = Ring()
+			  ){
+		const size_t n = grb::size( state );
+		JType delta_energy = static_cast< JType >( 0.0 ); // per call, hence not static
+		grb::Vector< JType > delta( n ), log_rand( n ), h( n ); // sized for this call
+		grb::RC rc = grb::SUCCESS;
+		for( size_t j = 0 ; j < n ; ++j ){
+			// uniform in the open interval (0,1), so that the logarithm is finite
+			const JType rand = ( static_cast< JType >( std::rand() ) + 1 ) / ( static_cast< JType >( RAND_MAX ) + 2 );
+			rc = rc ? rc : grb::setElement( log_rand, std::log( rand ), j );
+		}
+		// h = couplings * state + local_fields
+		rc = rc ? rc : grb::set( h, static_cast< JType >( 0.0 ) );
+		rc = rc ? rc : grb::mxv( h, couplings, state , ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
+		rc = rc ? rc : grb::foldl( h, local_fields, ring.getAdditiveOperator() );
+		for( size_t j = 0 ; rc == grb::SUCCESS && j < n ; ++j ){
+			const JType dn = (2.0 * state[ j ] - 1.0) * h[ j ];
+			if( !( dn >= 0 || (log_rand[ j ] < beta * dn) ) ){ continue; }
+			const IOType old_spin = state[ j ];
+			rc = rc ? rc : grb::setElement( state, 1 - old_spin, j );
+			delta_energy -= dn;
+			// h += couplings * delta, where delta holds only the change of spin j
+			rc = rc ? rc : grb::clear( delta );
+			rc = rc ? rc : grb::setElement( delta, static_cast< JType >( 1 ) - 2 * static_cast< JType >( old_spin ), j );
+			rc = rc ? rc : grb::mxv( h, couplings, delta, ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
+		}
+		assert( rc == grb::SUCCESS ); (void) rc;
+		return delta_energy;
+}
+
+
+/**
+ * Returns the sweep function to use for the given sweep name.
+ *
+ * The only sweep provided is sequential_sweep_immediate. For any other name
+ * a warning is written to std::cerr and that same function is returned.
+ *
+ * @param[in] sweep_name The requested sweep name.
+ *
+ * @returns sequential_sweep_immediate< Ring >, wrapped in a std::function.
+ */
+template<
+		class Ring = Semiring<
+			grb::operators::add< JType >, grb::operators::mul< JType >,
+			grb::identities::zero, grb::identities::one
+		> >
+std::function< JType(
+					 const grb::Matrix< JType >&,
+					 const grb::Vector< JType >&,
+					 grb::Vector< IOType >&,
+					 const JType&,
+					 const Ring&
+				 ) > get_sweep_function( std::string sweep_name ){
+	const bool is_known = ( sweep_name == "sequential_sweep_immediate" );
+	if( !is_known ){
+		std::cerr << "Warning: unknown sweep setting. Falling back to  \"sequential_sweep_immediate\"" << std::endl;
+	}
+	return sequential_sweep_immediate< Ring >;
+}
+
+
 
 void ioProgram( const struct input &data_in, bool &success ) {
 
@@ -249,14 +339,16 @@ void ioProgram( const struct input &data_in, bool &success ) {
     auto &nnz         = std::get<1>(storage); // nz (nonzeros)
     auto &nsweeps_st  = std::get<2>(storage); // nsweeps
     auto &n_replicas_st = std::get<3>(storage); // n_replicas
-    auto &seed_st     = std::get<4>(storage); // seed
-    auto &sweep_name  = std::get<5>(storage); // sweep_name
-    auto &Jdata       = std::get<6>(storage); // std::vector<NonzeroT>
-    auto &h           = std::get<7>(storage); // std::vector<JType>
+    auto &use_pt      = std::get<4>(storage); // seed
+    auto &seed_st     = std::get<5>(storage); // seed
+    auto &sweep_name  = std::get<6>(storage); // sweep_name
+    auto &Jdata       = std::get<7>(storage); // std::vector<NonzeroT>
+    auto &h           = std::get<8>(storage); // std::vector<JType>
 
     // Initialize metadata from input (allow CLI to override defaults)
     nsweeps_st    = data_in.nsweeps;
     n_replicas_st = data_in.n_replicas;
+    use_pt        = data_in.use_pt;
     seed_st       = data_in.seed;
     sweep_name    = data_in.sweep_name;
 
@@ -307,7 +399,7 @@ void grbProgram(
     // load into GraphBLAS
     grb::Matrix<JType> J( n, n );
 	{
-		const auto &data = std::get<6>(Storage::getData());
+		const auto &data = std::get<7>(Storage::getData());
 		RC io_rc = buildMatrixUnique(
 			J,
 			utils::makeNonzeroIterator<
@@ -347,7 +439,7 @@ void grbProgram(
 
     // build vector h with data from singleton
     {
-        const auto &h_data = std::get<7>(Storage::getData());
+        const auto &h_data = std::get<8>(Storage::getData());
 		rc = rc ? rc : buildVector(
 			h,
 			h_data.cbegin(),
@@ -362,8 +454,8 @@ void grbProgram(
     for ( size_t r = 0; r < n_replicas; ++r ) {
         states.emplace_back( grb::Vector<IOType>(n) );
         // initialize with random values
-        std::default_random_engine generator( std::get<4>(Storage::getData()) + r );
-        std::uniform_int_distribution<int> distribution(0,1);
+        std::default_random_engine generator( std::get<5>(Storage::getData()) + r );
+        std::uniform_int_distribution< int > distribution(0,1);
         // we use buildvectorUnique with a random set of indices
         std::vector< IOType > rand_data;
         for ( size_t i = 0; i < n; ++i ) {
@@ -377,6 +469,9 @@ void grbProgram(
             SEQUENTIAL
         );
     }
+	
+	const auto sweep = get_sweep_function( data_in.sweep_name );
+
 
     #ifdef DEBUG_IMSB
     if( s == 0 ) {
@@ -391,18 +486,18 @@ void grbProgram(
 
 
     // also make betas vector os size n_replicas and initialize with 10.0
-    grb::Vector<IOType> betas( n_replicas );
+    grb::Vector< JType > betas( n_replicas );
     for ( size_t r = 0; r < n_replicas; ++r ) {
-        rc = rc ? rc : grb::setElement( betas, static_cast<IOType>(10.0), r );
+        rc = rc ? rc : grb::setElement( betas, static_cast<JType>(10.0/r), r );
     }
     rc = rc ? rc : wait();
 
     // also make energies vector os size n_replicas and calculate values
     // in python energies = np.array([get_energy(couplings, local_fields, state) for state in states])
     // will be initalize in the algorithm
-    grb::Vector<IOType> energies( n_replicas );
+    grb::Vector< JType > energies( n_replicas );
+	grb::set( energies, 1 );
 
-    // all temporary vectors and matrices should be created here
 
     // TODO: add times
 
@@ -410,10 +505,9 @@ void grbProgram(
 	// time a single call
 	if( out.rep == 0 ) {
 		timer.reset();
-		// rc = simulated_annealing_RE(
-        //     energies, states, J, h, ... other params ... ,
-        //     .. temp args, sol, out.iterations
-        // );
+		rc = grb::algorithms::simulated_annealing_RE(
+				sweep, states, J, h, energies, betas, data_in.nsweeps, data_in.use_pt
+        );
 
 		rc = rc ? rc : wait();
 		double single_time = timer.time();
@@ -427,6 +521,10 @@ void grbProgram(
 		}
 		if( rc == SUCCESS ) {
 			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
+
+			for(size_t i = 0 ; i < n_replicas ; ++i ){
+				out.best_energy = std::min( out.best_energy, energies[ i ] );
+			}
 		}
 		if( rc != SUCCESS ) {
 			out.error_code = 25;
@@ -451,16 +549,27 @@ void grbProgram(
 		timer.reset();
 		for( size_t i = 0; i < out.rep && rc == SUCCESS; ++i ) {
 			if( rc == SUCCESS ) {
-                // rc = simulated_annealing_RE(
-                //     energies, states, J, h, ... other params ... ,
-                //     .. temp args, sol, out.iterations
-                // );
+				out.iterations = data_in.nsweeps;
+
+                rc = grb::algorithms::simulated_annealing_RE(
+					sweep, states, J, h, energies, betas, data_in.nsweeps, data_in.use_pt
+                );
+
 			}
 			if( grb::Properties<>::isNonblockingExecution ) {
 				rc = rc ? rc : wait();
 			}
 		}
 		const double time_taken = timer.time();
+		for ( size_t r = 0; r < n_replicas; ++r ) {
+			std::cout << "Final state replica " << r << ":\n";
+			print_vector( states[r], 30 ,"states values" );  
+			std::cout << std::endl;
+		}
+		for(size_t i = 0 ; i < n_replicas ; ++i ){
+			out.best_energy = std::min( out.best_energy, energies[ i ] );
+		}
+
 		out.times.useful = time_taken / static_cast< double >( out.rep );
 		// print timing at root process
 		if( grb::spmd<>::pid() == 0 ) {
@@ -478,6 +587,8 @@ void grbProgram(
 	// start postamble
 	timer.reset();
 
+
+
 	// set error code
 	if( rc == FAILED ) {
 		out.error_code = 30;
@@ -486,9 +597,6 @@ void grbProgram(
 		out.error_code = 35;
 		return;
 	}
-
-
-
 }
 
 
@@ -503,6 +611,7 @@ void printhelp( char *progname ) {
               << "  --h-fname STR              Path to h (local fields) vector (whitespace separated)\n"
               << "  --n-replicas INT           Number of replicas (default: 3)\n"
               << "  --nsweeps INT              Number of sweeps (default: 2)\n"
+              << "  --use-pt BOOL              Use Parallel Tampering (default: 1)\n"
               << "  --seed INT                 RNG seed (default: 8)\n"
               << "  --sweep STR                Sweep selector (default: sequential_sweep_immediate)\n"
               << "  --verify                   Verify output against reference solution\n"
@@ -537,6 +646,9 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
         } else if ( a == "--nsweeps" ) {
             if ( i+1 >= argc ) { std::cerr << "--nsweeps requires an argument\n"; return false; }
             in.nsweeps = static_cast<size_t>( std::stoul(argv[++i]) );
+        } else if ( a == "--use-pt" ) {
+            if ( i+1 >= argc ) { std::cerr << "--use-pt requires an argument\n"; return false; }
+            in.use_pt = static_cast<bool>( std::stoul(argv[++i]) );
         } else if ( a == "--seed" ) {
             if ( i+1 >= argc ) { std::cerr << "--seed requires an argument\n"; return false; }
             in.seed = static_cast<unsigned>( std::stoul(argv[++i]) );
@@ -559,7 +671,7 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
 
     // basic validation
     if ( !in.use_default_data ) {
-        if ( in.filename_Jmatrix.empty() || in.filename_h.empty() ) {
+        if ( in.filename_Jmatrix.empty() ) {
             std::cerr << "Either --use-default-data or both --j-matrix-fname and --h-fname must be provided\n";
             return false;
         }
@@ -615,4 +727,4 @@ int main( int argc, char ** argv ) {
 
     std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.iterations << " best_energy=" << out.best_energy << "\n";
     return out.error_code;
-}
\ No newline at end of file
+}

From 06c2a43c6f61df99904f94da61015203a01c63ab Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 17 Oct 2025 11:08:46 +0200
Subject: [PATCH 04/58] Bug fixes and debugging things in
 simulated_annealing_RE

---
 .../algorithms/simulated_annealing_re.cpp     |  37 +++--
 tests/smoke/simulated_annealing_re.cpp        | 155 ++++++++++++------
 2 files changed, 132 insertions(+), 60 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.cpp b/include/graphblas/algorithms/simulated_annealing_re.cpp
index 4ce4b6672..57139573f 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.cpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.cpp
@@ -29,7 +29,7 @@
 #include <vector>
 #include <algorithm>
 #include <cstdlib>
-#include <assert.h>
+#include <cassert>
 
 #ifndef NDEBUG
 #include <iostream>
@@ -54,8 +54,8 @@ namespace grb {
 			Backend backend
 			>
 		grb::RC pt(
-				std::vector< grb::Vector< StateType, backend > > &states,
-				grb::Vector< EnergyType > &energies,
+				const std::vector< grb::Vector< StateType, backend > > &states,
+				const grb::Vector< EnergyType > &energies,
 				grb::Vector< TempType > &betas
 				){
 			const size_t n_replicas = states.size();
@@ -76,11 +76,11 @@ namespace grb {
 		 * Replica Exchange (also known as Parallel Tempering).
 		 *
 		 * The state will be optimized to minimize the expression:
-		 * $x^TQx$, where $x$ is the binary state vector, and $Q$ is the coupling matrix.
+		 * $x^TQx$, where $x$ is the binary state vector, and $couplings$ is the coupling matrix.
 		 *
 		 * @param[in,out] x              On input: an initial state.
 		 *                               On output: the optimized state
-		 * @param[in]     Q              The (square, symmetric) couplings matrix.
+		 * @param[in]     couplings      The (square, symmetric) couplings matrix.
 		 * @param[in]     te             Probabilities of flipping each bit at each
 		 *                               iteration (values between 0 and 1)
 		 * @param[in]     n_replicas     Number of replicas to run concurrently.
@@ -113,7 +113,7 @@ namespace grb {
 				 	)
 				> &sweep,
 				std::vector< grb::Vector< StateType, backend > > &states,
-				const grb::Matrix< QType, backend, RSI, CSI, NZI > &Q,
+				const grb::Matrix< QType, backend, RSI, CSI, NZI > &couplings,
 				const grb::Vector< QType, backend > &local_fields,
 				grb::Vector< EnergyType > &energies,
 				grb::Vector< TempType > &betas,
@@ -122,12 +122,12 @@ namespace grb {
 				const Ring &ring = Ring()
 				){
 
-			const size_t n_replicas = states.size();
+			size_t n_replicas = states.size();
 
 			assert( n_replicas > 0 );
 			assert( n_replicas == grb::size( betas ) );
-			assert( grb::ncols( Q ) == grb::nrows( Q ) );
-			assert( grb::size( states[0] ) == grb::nrows( Q ) );
+			assert( grb::ncols( couplings ) == grb::nrows( couplings ) );
+			assert( grb::size( states[0] ) == grb::nrows( couplings ) );
 			assert( grb::size( states[0] ) == grb::size( local_fields ) );
 
 			for(size_t i = 1; i < n_replicas ; ++i ){
@@ -147,16 +147,25 @@ namespace grb {
 
 			grb::RC rc = grb::SUCCESS;
 
-			static std::vector< grb::Vector< StateType, backend > >  best_states = states;
-			auto best_energies = energies;
+			static std::vector< grb::Vector< StateType, backend > >  best_states;
+			static grb::Vector< EnergyType > best_energies ( n_replicas );
+			best_energies = energies;
+			best_states =  states;
 
 			for( size_t i_sweep = 0 ; rc == grb::SUCCESS && i_sweep < n_sweeps ; ++i_sweep ){
 				// randomize order of replicas
-				std::random_shuffle( states.begin(), states.end() );
-
+				// std::random_shuffle( states.begin(), states.end() );
+
+				/*
+				grb::eWiseApply(energies, states, betas, 
+						[&](auto state, auto beta){
+						return sweep( couplings, local_fields, state, beta, ring )
+						}
+						);
+						*/
 				for( size_t j = 0 ; rc == grb::SUCCESS && j < n_replicas ; ++j ){
 					
-					energies[j] += sweep( Q, local_fields, states[j], betas[j], ring );
+					energies[j] += sweep( couplings, local_fields, states[j], betas[j], ring );
 				
 					// update_best state and energy
 					if( energies[j] < best_energies[j] ){
diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index e60dbd8cc..63cf23e16 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -36,6 +36,8 @@
 using namespace grb;
 
 #define DEBUG_IMSB 1
+#define ISCLOSE(x) std::abs(x) < 1e-4
+
 
 // Types
 using IOType = double;   // scalar/vector element type
@@ -81,33 +83,44 @@ namespace test_data {
     constexpr unsigned seed = 8;
 
     const std::vector< std::pair< std::pair< grb::config::RowIndexType, grb::config::ColIndexType >, JType > > j_matrix_data = {
-        {{0, 1}, -0.27523006},
-        {{1, 0}, -0.27523006},
-        {{1, 2},  0.28977992},
-        {{2, 1},  0.28977992},
-        {{2, 3}, -0.15453839},
-        {{3, 2}, -0.15453839},
-        {{3, 4},  0.48474944},
-        {{3, 5}, -0.61958321},
-        {{4, 3},  0.48474944},
-        {{4, 5}, -0.11904111},
-        {{5, 3}, -0.61958321},
-        {{5, 4}, -0.11904111},
-        {{5, 6},  0.70296404},
-        {{6, 5},  0.70296404},
-        {{7, 8}, -0.18031020},
-        {{8, 7}, -0.18031020},
-        {{9, 10}, 0.13823881},
-        {{10, 9}, 0.13823881}
+		{{0, 1}, -0.2752300610319546},
+		{{1, 0}, -0.2752300610319546},
+		{{1, 2}, -0.10636508505639508},
+		{{2, 1}, -0.10636508505639508},
+		{{2, 3}, 0.3961450048806352},
+		{{3, 2}, 0.3961450048806352},
+		{{3, 4}, -0.15453838800213293},
+		{{3, 5}, 0.4847494372852713},
+		{{4, 3}, -0.15453838800213293},
+		{{4, 5}, -0.4712679510367046},
+		{{5, 3}, 0.4847494372852713},
+		{{5, 4}, -0.4712679510367046},
+		{{5, 6}, -0.1483152637298799},
+		{{6, 5}, -0.1483152637298799},
+		{{7, 8}, -0.11904111079614699},
+		{{8, 7}, -0.11904111079614699},
+		{{9, 10}, -0.18031020353297234},
+		{{10, 9}, -0.18031020353297234},
+		{{10, 11}, -0.22985425840853468},
+		{{11, 10}, -0.22985425840853468},
+		{{11, 12}, 0.30105588632639446},
+		{{11, 13}, 0.13823880612312134},
+		{{12, 11}, 0.30105588632639446},
+		{{13, 11}, 0.13823880612312134},
+		{{13, 14}, 0.10364447636911123},
+		{{14, 13}, 0.10364447636911123},
+		{{14, 15}, 0.2955745584289766},
+		{{15, 14}, 0.2955745584289766},
     };
 
+
     const size_t nnz = j_matrix_data.size();
 
     const std::vector< JType > h_array_data = {
-        0.03076145, -0.06152290, 0.09228435, -0.12304580,
-        0.15380725, -0.18456870, 0.21533015, -0.24609160,
-        0.27685305, -0.30761450, 0.33837595, -0.36913740,
-        0.39989885, -0.43066030, 0.46142175, -0.49218320
+		-0.08910436,  0.58034508,  0.97719304,  0.16792909,
+		-0.9221754 , -0.10715418 -0.62365497,  0.25411129,
+		-0.5693644 , -0.69805978 , 0.07228861 -0.79922641,
+		0.46231686 , 0.87930208 , 0.88663637, -0.25052299
     };
 }
 // --- New, minimal runner configuration and result types ---
@@ -243,6 +256,31 @@ void read_vector_data_from_array(
 	}
 }
 
+template<
+		class Ring = Semiring<
+			grb::operators::add< JType >, grb::operators::mul< JType >,
+			grb::identities::zero, grb::identities::one
+		> >
+JType get_energy(
+				 const grb::Matrix< JType >& couplings,
+				 const grb::Vector< JType > &local_fields,
+				 const grb::Vector< IOType > &state,
+				 const Ring &ring = Ring()
+			  ){
+	static grb::Vector< JType > tmp ( grb::size( state ) );
+	grb::clear( tmp );
+	JType energy = 0.0;
+
+	grb::mxv( tmp, couplings, state, ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
+	grb::foldl( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeOperator() );
+	grb::foldl( tmp, local_fields, ring.getAdditiveOperator() );
+	grb::dot( energy, tmp, state, ring );
+
+	return energy;
+}
+
+
+
 template<
 		// Backend backend=grb::reference,
 		class Ring = Semiring<
@@ -258,13 +296,17 @@ static JType sequential_sweep_immediate(
 			  ){
 
 		const size_t n = grb::size( state );
-		static JType delta_energy = static_cast< JType >(0.0);
+		JType delta_energy = static_cast< JType >(0.0);
 
-		static grb::Vector< JType > delta ( n );
-		static grb::Vector< JType > dn ( n );
-		static grb::Vector< JType > log_rand ( n );
 		static grb::Vector< JType > h ( n );
+		grb::set( h, static_cast< JType >( 0.0 ) );
+		grb::mxv( h, couplings, state , ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
+		grb::foldl( h, local_fields, ring.getAdditiveOperator() );
+
+		// static grb::Vector< JType > delta ( n );
+		// static grb::Vector< JType > dn ( n );
 
+		static grb::Vector< JType > log_rand ( n );
 		for( size_t j = 0 ; j < n ; ++j ){
 			constexpr auto rm = static_cast< JType >( RAND_MAX ) + 2;
 			const auto randi =static_cast< JType >( std::rand() ) + 1;
@@ -272,26 +314,41 @@ static JType sequential_sweep_immediate(
 			grb::setElement(log_rand,  std::log( rand ), j );
 		}
 
-		grb::set( h, static_cast< JType >( 0.0 ) );
-		grb::mxv( h, couplings, state , ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
-		grb::foldl( h, local_fields, ring.getAdditiveOperator() );
 
 
+		const auto old_state = state;
 		// TODO: masking
 		for( size_t j = 0 ; j < n ; ++j ){
-#ifndef NDEBUG
-			// std::cerr << "...." << std::endl;
-#endif
-			grb::setElement( dn,  (2.0 * state[ j ] - 1.0) * h[ j ], j );
-			if( dn[ j ] >= 0 || (log_rand[ j ] < beta * dn[ j ]) ){
-				grb::setElement( state,  1 - state[j], j );
-				grb::setElement( delta,  static_cast< JType >( 1 ), j );
-				delta_energy -= dn[ j ] * delta[ j ];
-			}
+			JType delta = static_cast< JType >( 0 );
+			JType dn = static_cast< JType >( 0 );
+			dn = (2.0 * state[ j ] - 1.0) * h[ j ];
+
+			const bool accept = ( dn >= 0 ) || ( log_rand[ j ] < beta * dn );
+			const IOType old = state[ j ];
+			grb::setElement( state,  (accept ? 1 - old : old), j );
+
+			// grb::setElement( delta,  static_cast< JType >( state[j] - old ), j );
+			delta =  static_cast< JType >( state[j] - old );
+			delta_energy -= dn * accept ;
 
 			// update h
-			grb::mxv( h, couplings, delta, ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
+			if( delta ){
+				static grb::Vector< JType > deltav ( n );
+				grb::clear( deltav );
+				grb::setElement( deltav, delta , j );
+
+				grb::mxv( h, couplings, deltav, ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
+			}
 		}
+		const auto new_state = state;
+
+		// std::cerr << "\n\t Delta_energy: " << delta_energy;
+		// std::cerr << "\n\t Real delta: " << (get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state));
+		// std::cerr << "\n\t Old energy: " << get_energy(couplings, local_fields, old_state) ;
+		// std::cerr << "\n\t New energy: " << get_energy(couplings, local_fields, new_state);
+		// std::cerr << std::endl;
+
+		assert( ISCLOSE(delta_energy - (get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state))) );
 
 		return delta_energy;
 }
@@ -326,8 +383,6 @@ std::function< JType(
 	 return sequential_sweep_immediate< Ring >;
 }
 
-
-
 void ioProgram( const struct input &data_in, bool &success ) {
 
     using namespace test_data;
@@ -335,8 +390,8 @@ void ioProgram( const struct input &data_in, bool &success ) {
 	// Parse and store matrix in singleton class
     // Map Storage tuple fields to meaningful names and wire up default data
     auto &storage = Storage::getData();
-    auto &n           = std::get<0>(storage); // n (rows/cols)
-    auto &nnz         = std::get<1>(storage); // nz (nonzeros)
+    // auto &n           = std::get<0>(storage); // n (rows/cols)
+    // auto &nnz         = std::get<1>(storage); // nz (nonzeros)
     auto &nsweeps_st  = std::get<2>(storage); // nsweeps
     auto &n_replicas_st = std::get<3>(storage); // n_replicas
     auto &use_pt      = std::get<4>(storage); // seed
@@ -478,25 +533,30 @@ void grbProgram(
         for ( size_t r = 0; r < n_replicas; ++r ) {
             std::cout << "Initial state replica " << r << ":\n";
             print_vector( states[r], 30 ,"states values" );  
+			std::cout << "With energy " << get_energy(  J, h, states[r] ) << "\n";
             std::cout << std::endl;
-
         }
+
+		grb::Vector<JType> zero ( n );
+		grb::set( zero, 0 );
+		grb::setElement( zero, 1, 1 );
+		assert( std::abs(get_energy(  J, h, zero ) - 0.5803450826765713) < 1e-4 );
     }
     #endif
 
 
     // also make betas vector os size n_replicas and initialize with 10.0
     grb::Vector< JType > betas( n_replicas );
+    grb::Vector< JType > energies( n_replicas );
     for ( size_t r = 0; r < n_replicas; ++r ) {
         rc = rc ? rc : grb::setElement( betas, static_cast<JType>(10.0/r), r );
+        rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r] ), r );
     }
     rc = rc ? rc : wait();
 
     // also make energies vector os size n_replicas and calculate values
     // in python energies = np.array([get_energy(couplings, local_fields, state) for state in states])
     // will be initalize in the algorithm
-    grb::Vector< JType > energies( n_replicas );
-	grb::set( energies, 1 );
 
 
     // TODO: add times
@@ -564,7 +624,10 @@ void grbProgram(
 		for ( size_t r = 0; r < n_replicas; ++r ) {
 			std::cout << "Final state replica " << r << ":\n";
 			print_vector( states[r], 30 ,"states values" );  
+			std::cout << "With energy " << energies[ r ] << "\n";
+			std::cout << "With energy " << get_energy(  J, h, states[r] ) << "\n";
 			std::cout << std::endl;
+			assert( ISCLOSE(energies[ r ] - get_energy(  J, h, states[r] ) ) );
 		}
 		for(size_t i = 0 ; i < n_replicas ; ++i ){
 			out.best_energy = std::min( out.best_energy, energies[ i ] );

From 1c0f432189e86c82b458943b0a08e76fa58d512a Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 17 Oct 2025 12:15:35 +0200
Subject: [PATCH 05/58] Added documentation to simulated_annealing_re +
 bugfixing

---
 ...ling_re.cpp => simulated_annealing_re.hpp} | 45 +++++++++++++------
 tests/smoke/simulated_annealing_re.cpp        | 45 ++++++++++---------
 2 files changed, 56 insertions(+), 34 deletions(-)
 rename include/graphblas/algorithms/{simulated_annealing_re.cpp => simulated_annealing_re.hpp} (72%)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.cpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
similarity index 72%
rename from include/graphblas/algorithms/simulated_annealing_re.cpp
rename to include/graphblas/algorithms/simulated_annealing_re.hpp
index 57139573f..b59b0ed68 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.cpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -43,8 +43,18 @@ namespace grb {
 	namespace algorithms {
 
 		/*
-		 * Parallel Tempering
+		 * Do a Parallel Tempering pass.
+		 * This means exchanging states at low temperature with states at higher temperature.
+		 * To make the code simpler, this will be done by exchanging the temperatures instead.
 		 *
+		 * @param[in] states        On input: initial states.
+		 * @param[in] energies      The initial energy of each state.
+		 * @param[in,out] betas     Inverse temperature of each state.
+		 * 							The betas may be permuted.
+		 *
+		 * @tparam StateType	The state variable type.
+		 * @tparam EnergyType	The energy type.
+		 * @tparam TempType		The inverse temperature type.
 		 *
 		 */
 		template<
@@ -75,20 +85,29 @@ namespace grb {
 		 * (QUBO) optimization problem. The solution is found using Simulated Annealing-
 		 * Replica Exchange (also known as Parallel Tempering).
 		 *
-		 * The state will be optimized to minimize the expression:
-		 * $x^TQx$, where $x$ is the binary state vector, and $couplings$ is the coupling matrix.
+		 * The state will be optimized to minimize the value of the energy $U(x)$,
+		 * where $x$ is the binary state vector, and $couplings$ is the coupling matrix.
+		 * Energies will be changed when changing the states, so that each energy is
+		 * the actual energy of its respective state.
+		 * The parameter sweep is a function that (randomly) changes a given state and
+		 * returns the variation of energy made from its changes of the state.
 		 *
-		 * @param[in,out] x              On input: an initial state.
-		 *                               On output: the optimized state
-		 * @param[in]     couplings      The (square, symmetric) couplings matrix.
-		 * @param[in]     te             Probabilities of flipping each bit at each
-		 *                               iteration (values between 0 and 1)
-		 * @param[in]     n_replicas     Number of replicas to run concurrently.
-		 * @param[in]     n_sweeps       Number of iterations.
-		 * @param[in]     seed 			 Seed to use in the generation of random bit flips.
+		 * @param[in]     sweep      	The sweeping function.
+		 * 								Should return the energy variation implied from the changes that it made on the state.
+		 * @param[in,out] states        On input: initial states.
+		 *                              On output: optimized states.
+		 * @param[in]     couplings     The square (symmetric) couplings matrix.
+		 * @param[in,out] energies      The initial energy of each state.
+		 * @param[in,out] betas     	Inverse temperature of each state.
+		 * @param[in]     n_replicas    Number of replicas to run concurrently.
+		 * @param[in]     n_sweeps      Number of Simulated Annealing iterations.
+		 * @param[in]     use_pt		Whether to use Parallel Tampering or not.
 		 *
-		 * @tparam QType         The input/output vector nonzero type
-		 * @tparam QType         The input/output vector nonzero type
+		 * @tparam QType		The coupling matrix and the local fields type.
+		 * @tparam StateType	The state variable type.
+		 * @tparam EnergyType	The energy type.
+		 * @tparam TempType		The inverse temperature type.
+		 * @tparam Ring			The semiring under which to make the sweeps.
 		 *
 		 */
 		template<
diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index 63cf23e16..f4681344e 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -22,7 +22,7 @@
 #include <cstdlib>
 #include <unistd.h>
 
-#include <graphblas/algorithms/simulated_annealing_re.cpp>
+#include <graphblas/algorithms/simulated_annealing_re.hpp>
 #include <graphblas/nonzeroStorage.hpp>
 #include <graphblas/utils/timer.hpp>
 #include <graphblas/utils/parser.hpp>
@@ -36,7 +36,7 @@
 using namespace grb;
 
 #define DEBUG_IMSB 1
-#define ISCLOSE(x) std::abs(x) < 1e-4
+#define ISCLOSE(a,b) (std::abs((b)-(a))/std::abs(a) < 1e-4) || (std::abs((b)-(a)) < 1e-4)
 
 
 // Types
@@ -295,37 +295,39 @@ static JType sequential_sweep_immediate(
 				 const Ring &ring = Ring()
 			  ){
 
+		grb::RC rc = grb::SUCCESS;
 		const size_t n = grb::size( state );
 		JType delta_energy = static_cast< JType >(0.0);
 
-		static grb::Vector< JType > h ( n );
-		grb::set( h, static_cast< JType >( 0.0 ) );
-		grb::mxv( h, couplings, state , ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
-		grb::foldl( h, local_fields, ring.getAdditiveOperator() );
+		grb::Vector< JType > h ( n );
+		rc = rc ? rc : grb::set( h, static_cast< JType >( 0.0 ) );
+		rc = rc ? rc : grb::mxv( h, couplings, state , ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
+		rc = rc ? rc : grb::foldl( h, local_fields, ring.getAdditiveOperator() );
+		// rc = rc ? rc : grb::foldl( h, local_fields, ring.getAdditiveOperator() );
 
 		// static grb::Vector< JType > delta ( n );
 		// static grb::Vector< JType > dn ( n );
 
-		static grb::Vector< JType > log_rand ( n );
+		grb::Vector< JType > log_rand ( n );
 		for( size_t j = 0 ; j < n ; ++j ){
 			constexpr auto rm = static_cast< JType >( RAND_MAX ) + 2;
 			const auto randi =static_cast< JType >( std::rand() ) + 1;
 			const auto rand = randi / rm ;
-			grb::setElement(log_rand,  std::log( rand ), j );
+			rc = rc ? rc : grb::setElement(log_rand,  std::log( rand ), j );
 		}
 
 
 
 		const auto old_state = state;
 		// TODO: masking
-		for( size_t j = 0 ; j < n ; ++j ){
+		for( size_t j = 0 ; rc == grb::SUCCESS && j < n ; ++j ){
 			JType delta = static_cast< JType >( 0 );
 			JType dn = static_cast< JType >( 0 );
 			dn = (2.0 * state[ j ] - 1.0) * h[ j ];
 
 			const bool accept = ( dn >= 0 ) || ( log_rand[ j ] < beta * dn );
 			const IOType old = state[ j ];
-			grb::setElement( state,  (accept ? 1 - old : old), j );
+			rc = rc ? rc : grb::setElement( state,  (accept ? 1 - old : old), j );
 
 			// grb::setElement( delta,  static_cast< JType >( state[j] - old ), j );
 			delta =  static_cast< JType >( state[j] - old );
@@ -334,12 +336,13 @@ static JType sequential_sweep_immediate(
 			// update h
 			if( delta ){
 				static grb::Vector< JType > deltav ( n );
-				grb::clear( deltav );
-				grb::setElement( deltav, delta , j );
+				rc = rc ? rc : grb::clear( deltav );
+				rc = rc ? rc : grb::setElement( deltav, delta , j );
 
-				grb::mxv( h, couplings, deltav, ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
+				rc = rc ? rc : grb::mxv( h, couplings, deltav, ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
 			}
 		}
+		assert( rc == grb::SUCCESS );
 		const auto new_state = state;
 
 		// std::cerr << "\n\t Delta_energy: " << delta_energy;
@@ -348,7 +351,7 @@ static JType sequential_sweep_immediate(
 		// std::cerr << "\n\t New energy: " << get_energy(couplings, local_fields, new_state);
 		// std::cerr << std::endl;
 
-		assert( ISCLOSE(delta_energy - (get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state))) );
+		assert( ISCLOSE(get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state), delta_energy ) );
 
 		return delta_energy;
 }
@@ -485,9 +488,9 @@ void grbProgram(
 		}
 
 #ifdef DEBUG_IMSB
-	if( s == 0 ) {
+	if( s == 0 && grb::ncols( J ) < 40 ) {
 		std::cout << "Matrix J:\n";
-		print_matrix( J);
+		print_matrix( J );
 	}
 #endif
 	}
@@ -540,7 +543,7 @@ void grbProgram(
 		grb::Vector<JType> zero ( n );
 		grb::set( zero, 0 );
 		grb::setElement( zero, 1, 1 );
-		assert( std::abs(get_energy(  J, h, zero ) - 0.5803450826765713) < 1e-4 );
+		// assert( std::abs(get_energy(  J, h, zero ) - 0.5803450826765713) < 1e-4 );
     }
     #endif
 
@@ -548,7 +551,7 @@ void grbProgram(
     // also make betas vector os size n_replicas and initialize with 10.0
     grb::Vector< JType > betas( n_replicas );
     grb::Vector< JType > energies( n_replicas );
-    for ( size_t r = 0; r < n_replicas; ++r ) {
+    for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
         rc = rc ? rc : grb::setElement( betas, static_cast<JType>(10.0/r), r );
         rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r] ), r );
     }
@@ -623,11 +626,11 @@ void grbProgram(
 		const double time_taken = timer.time();
 		for ( size_t r = 0; r < n_replicas; ++r ) {
 			std::cout << "Final state replica " << r << ":\n";
-			print_vector( states[r], 30 ,"states values" );  
+			print_vector( states[r], 50 ,"states values" );  
 			std::cout << "With energy " << energies[ r ] << "\n";
 			std::cout << "With energy " << get_energy(  J, h, states[r] ) << "\n";
 			std::cout << std::endl;
-			assert( ISCLOSE(energies[ r ] - get_energy(  J, h, states[r] ) ) );
+			assert( ISCLOSE( get_energy( J, h, states[r] ), energies[ r ] ) );
 		}
 		for(size_t i = 0 ; i < n_replicas ; ++i ){
 			out.best_energy = std::min( out.best_energy, energies[ i ] );
@@ -734,7 +737,7 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
 
     // basic validation
     if ( !in.use_default_data ) {
-        if ( in.filename_Jmatrix.empty() ) {
+        if ( in.filename_Jmatrix.empty() || in.filename_h.empty() ) {
             std::cerr << "Either --use-default-data or both --j-matrix-fname and --h-fname must be provided\n";
             return false;
         }

From e690759339cb71403c39a763929e212e5c19dee2 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 21 Oct 2025 15:38:46 +0200
Subject: [PATCH 06/58] Functions now no longer allocate vectors internally

---
 .../algorithms/simulated_annealing_re.hpp     | 54 +++++++-----
 tests/smoke/simulated_annealing_re.cpp        | 82 +++++++++----------
 2 files changed, 74 insertions(+), 62 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index b59b0ed68..25be3967b 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -27,9 +27,11 @@
 #define _H_GRB_ALGORITHMS_SA_RE
 
 #include <vector>
+#include <type_traits>
 #include <algorithm>
 #include <cstdlib>
 #include <cassert>
+#include <cmath>
 
 #ifndef NDEBUG
 #include <iostream>
@@ -70,9 +72,10 @@ namespace grb {
 				){
 			const size_t n_replicas = states.size();
 
-			for( size_t i = 1 ; i < n_replicas ; ++i ){
+			for( size_t i = n_replicas-1 ; i > 0 ; --i ){
         		const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
-				if( de >= 0 || std::rand() < RAND_MAX*std::exp(de) ){
+
+				if( de >= 0 || std::rand() < RAND_MAX * exp( de ) ){
 					std::swap( betas[i], betas[i-1] );
 				}	
 			}
@@ -99,6 +102,8 @@ namespace grb {
 		 * @param[in]     couplings     The square (symmetric) couplings matrix.
 		 * @param[in,out] energies      The initial energy of each state.
 		 * @param[in,out] betas     	Inverse temperature of each state.
+		 * @param[in,out] temp_states   Workspace holding the best state found so far for each replica.
+		 * @param[in,out] temp_energies Workspace holding the best energy found so far for each replica.
 		 * @param[in]     n_replicas    Number of replicas to run concurrently.
 		 * @param[in]     n_sweeps      Number of Simulated Annealing iterations.
 		 * @param[in]     use_pt		Whether to use Parallel Tampering or not.
@@ -128,6 +133,9 @@ namespace grb {
 						 const grb::Vector< QType, backend >&,
 						 grb::Vector< StateType, backend >&,
 						 const TempType&,
+						 grb::Vector< QType >&,
+						 grb::Vector< QType >&,
+						 grb::Vector< EnergyType >&,
 						 const Ring&
 				 	)
 				> &sweep,
@@ -136,6 +144,11 @@ namespace grb {
 				const grb::Vector< QType, backend > &local_fields,
 				grb::Vector< EnergyType > &energies,
 				grb::Vector< TempType > &betas,
+				std::vector< grb::Vector< StateType, backend > >  &temp_states,
+				grb::Vector< EnergyType > &temp_energies,
+				grb::Vector< QType > &temp_sweep1,
+				grb::Vector< QType > &temp_sweep2,
+				grb::Vector< EnergyType > &temp_sweep3,
 				const size_t &n_sweeps = 1,
 				const bool &use_pt = false,
 				const Ring &ring = Ring()
@@ -166,34 +179,26 @@ namespace grb {
 
 			grb::RC rc = grb::SUCCESS;
 
-			static std::vector< grb::Vector< StateType, backend > >  best_states;
-			static grb::Vector< EnergyType > best_energies ( n_replicas );
-			best_energies = energies;
-			best_states =  states;
+			temp_energies = energies;
+			temp_states =  states;
 
 			for( size_t i_sweep = 0 ; rc == grb::SUCCESS && i_sweep < n_sweeps ; ++i_sweep ){
 				// randomize order of replicas
 				// std::random_shuffle( states.begin(), states.end() );
 
-				/*
-				grb::eWiseApply(energies, states, betas, 
-						[&](auto state, auto beta){
-						return sweep( couplings, local_fields, state, beta, ring )
-						}
-						);
-						*/
 				for( size_t j = 0 ; rc == grb::SUCCESS && j < n_replicas ; ++j ){
 					
-					energies[j] += sweep( couplings, local_fields, states[j], betas[j], ring );
+				energies[j] += sweep( couplings, local_fields, states[j], betas[j], temp_sweep1, temp_sweep2 , temp_sweep3 , ring );
 				
 					// update_best state and energy
-					if( energies[j] < best_energies[j] ){
-						best_energies[j] = energies[j];
-						best_states[j] = states[j];
+					if( energies[j] < temp_energies[j] ){
+						temp_energies[j] = energies[j];
+						temp_states[j] = states[j];
 					}
 				} // n_replicas
 
-				if( rc == SUCCESS && use_pt ){ // Parallel Tempering move
+				if( rc == SUCCESS && use_pt ){
+					// do a Parallel Tempering move
 					rc = pt( states, energies, betas );
 				}
 #ifndef NDEBUG
@@ -208,14 +213,21 @@ namespace grb {
 			}
 #endif
 			if( rc == SUCCESS ){
-				// copy assignment throws an error. We'll do move-assignment I guess
-				states = std::move(best_states);
-				energies = std::move(best_energies);
+				states = temp_states;
+				energies = temp_energies;
 			}
 
 			return rc;
 		}
 
+		template< typename T >
+		inline T
+		exp(T x ){
+			static_assert(std::is_same<T, float>::value ||
+				std::is_same<T, double>::value ||
+				std::is_same<T, long double>::value);
+			return std::exp( x );
+		}
 	} // namespace algorithms
 
 } // end namespace grb
diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index f4681344e..01e74926a 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -40,8 +40,9 @@ using namespace grb;
 
 
 // Types
-using IOType = double;   // scalar/vector element type
+using IOType = int;   // scalar/vector element type
 using JType  = double;   // coupling (matrix) value type
+using EnergyType  = double;   // coupling (matrix) value type
 
 /** Parser type */
 typedef grb::utils::MatrixFileReader<
@@ -144,7 +145,7 @@ struct output {
     int error_code = 0;
     // TODO: remove itrations if not applicable
     size_t iterations = 10; // total number of iterations performed does not make sense since the code does not have convergence criteria
-    double best_energy = std::numeric_limits< JType >::max();
+    EnergyType best_energy = std::numeric_limits< EnergyType >::max();
 	size_t rep;
 	grb::utils::TimerResults times;
     std::unique_ptr< PinnedVector< JType > > pinnedSolutionVector;
@@ -261,7 +262,7 @@ template<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one
 		> >
-JType get_energy(
+EnergyType get_energy(
 				 const grb::Matrix< JType >& couplings,
 				 const grb::Vector< JType > &local_fields,
 				 const grb::Vector< IOType > &state,
@@ -286,29 +287,29 @@ template<
 		class Ring = Semiring<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one
-		> >
-static JType sequential_sweep_immediate(
+		>
+	>
+static EnergyType sequential_sweep_immediate(
 				 const grb::Matrix< JType >& couplings,
 				 const grb::Vector< JType > &local_fields,
 				 grb::Vector< IOType > &state,
 				 const JType &beta,
+				 grb::Vector< JType > &h,
+				 grb::Vector< JType > &log_rand,
+				 grb::Vector< EnergyType > &deltav,
 				 const Ring &ring = Ring()
 			  ){
 
 		grb::RC rc = grb::SUCCESS;
 		const size_t n = grb::size( state );
-		JType delta_energy = static_cast< JType >(0.0);
+		EnergyType delta_energy = static_cast< JType >(0.0);
 
-		grb::Vector< JType > h ( n );
+		rc = rc ? rc : grb::resize( h, n );
 		rc = rc ? rc : grb::set( h, static_cast< JType >( 0.0 ) );
 		rc = rc ? rc : grb::mxv( h, couplings, state , ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
 		rc = rc ? rc : grb::foldl( h, local_fields, ring.getAdditiveOperator() );
-		// rc = rc ? rc : grb::foldl( h, local_fields, ring.getAdditiveOperator() );
-
-		// static grb::Vector< JType > delta ( n );
-		// static grb::Vector< JType > dn ( n );
 
-		grb::Vector< JType > log_rand ( n );
+		rc = rc ? rc : grb::resize( log_rand, n );
 		for( size_t j = 0 ; j < n ; ++j ){
 			constexpr auto rm = static_cast< JType >( RAND_MAX ) + 2;
 			const auto randi =static_cast< JType >( std::rand() ) + 1;
@@ -316,26 +317,25 @@ static JType sequential_sweep_immediate(
 			rc = rc ? rc : grb::setElement(log_rand,  std::log( rand ), j );
 		}
 
-
-
-		const auto old_state = state;
+#ifndef NDEBUG
+		const grb::Vector< IOType > old_state = state;
+#endif
 		// TODO: masking
 		for( size_t j = 0 ; rc == grb::SUCCESS && j < n ; ++j ){
-			JType delta = static_cast< JType >( 0 );
-			JType dn = static_cast< JType >( 0 );
+			EnergyType delta = static_cast< EnergyType >( 0 );
+			EnergyType dn = static_cast< EnergyType >( 0 );
 			dn = (2.0 * state[ j ] - 1.0) * h[ j ];
 
 			const bool accept = ( dn >= 0 ) || ( log_rand[ j ] < beta * dn );
 			const IOType old = state[ j ];
 			rc = rc ? rc : grb::setElement( state,  (accept ? 1 - old : old), j );
 
-			// grb::setElement( delta,  static_cast< JType >( state[j] - old ), j );
-			delta =  static_cast< JType >( state[j] - old );
+			// grb::setElement( delta,  static_cast< IOType >( state[j] - old ), j );
+			delta =  static_cast< EnergyType >( state[j] - old );
 			delta_energy -= dn * accept ;
 
 			// update h
 			if( delta ){
-				static grb::Vector< JType > deltav ( n );
 				rc = rc ? rc : grb::clear( deltav );
 				rc = rc ? rc : grb::setElement( deltav, delta , j );
 
@@ -343,6 +343,7 @@ static JType sequential_sweep_immediate(
 			}
 		}
 		assert( rc == grb::SUCCESS );
+#ifndef NDEBUG
 		const auto new_state = state;
 
 		// std::cerr << "\n\t Delta_energy: " << delta_energy;
@@ -352,6 +353,7 @@ static JType sequential_sweep_immediate(
 		// std::cerr << std::endl;
 
 		assert( ISCLOSE(get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state), delta_energy ) );
+#endif
 
 		return delta_energy;
 }
@@ -363,23 +365,16 @@ template<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one
 		> >
-std::function< JType(
+std::function< EnergyType(
 					 const grb::Matrix< JType >&,
 					 const grb::Vector< JType >&,
 					 grb::Vector< IOType >&,
 					 const JType&,
+					 grb::Vector< JType >&,
+					 grb::Vector< JType >&,
+					 grb::Vector< EnergyType >&,
 					 const Ring&
 				 ) > get_sweep_function( std::string sweep_name ){
-	/*
-	if( sweep_name != "sequential_sweep_x" ){
-		return [](
-				 const grb::Matrix< JType >& couplings,
-				 const grb::Vector< JType > &local_fields,
-				 grb::Vector< IOType > &state,
-				 const JType &beta,
-				 const Ring &ring = Ring()
-			  ){ return 0; };
-	} */
 	if( sweep_name != "sequential_sweep_immediate" ){
 			std::cerr << "Warning: unknown sweep setting. Falling back to  \"sequential_sweep_immediate\"" << std::endl;
 	}
@@ -450,12 +445,12 @@ void grbProgram(
     /* --- Problem setup --- */
     const size_t n = std::get<0>(Storage::getData());
 	std::cout << "problem size n = " << n << "\n";
-    grb::Vector<JType> h( n );
+    grb::Vector< JType > h( n );
     // populate J with test (random) values
     grb::RC rc = grb::SUCCESS;
 
     // load into GraphBLAS
-    grb::Matrix<JType> J( n, n );
+    grb::Matrix< JType > J( n, n );
 	{
 		const auto &data = std::get<7>(Storage::getData());
 		RC io_rc = buildMatrixUnique(
@@ -540,9 +535,9 @@ void grbProgram(
             std::cout << std::endl;
         }
 
-		grb::Vector<JType> zero ( n );
-		grb::set( zero, 0 );
-		grb::setElement( zero, 1, 1 );
+		// grb::Vector< JType > zero ( n );
+		// grb::set( zero, 0 );
+		// grb::setElement( zero, 1, 1 );
 		// assert( std::abs(get_energy(  J, h, zero ) - 0.5803450826765713) < 1e-4 );
     }
     #endif
@@ -550,9 +545,10 @@ void grbProgram(
 
     // also make betas vector os size n_replicas and initialize with 10.0
     grb::Vector< JType > betas( n_replicas );
-    grb::Vector< JType > energies( n_replicas );
+    grb::Vector< EnergyType > energies( n_replicas );
+    grb::Vector< EnergyType > temp_energies( n_replicas );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
-        rc = rc ? rc : grb::setElement( betas, static_cast<JType>(10.0/r), r );
+        rc = rc ? rc : grb::setElement( betas, static_cast< JType >(10.0), r );
         rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r] ), r );
     }
     rc = rc ? rc : wait();
@@ -562,14 +558,18 @@ void grbProgram(
     // will be initalize in the algorithm
 
 
-    // TODO: add times
+    std::vector< grb::Vector<IOType> > temp_states;
+	grb::Vector< JType > temp_h ( n );
+	grb::Vector< JType > temp_log_rand ( n );
+	grb::Vector< EnergyType > temp_deltav ( n );
+
 
 	out.rep = data_in.rep;
 	// time a single call
 	if( out.rep == 0 ) {
 		timer.reset();
 		rc = grb::algorithms::simulated_annealing_RE(
-				sweep, states, J, h, energies, betas, data_in.nsweeps, data_in.use_pt
+				sweep, states, J, h, energies, betas, temp_states, temp_energies, temp_h, temp_log_rand, temp_deltav, data_in.nsweeps, data_in.use_pt
         );
 
 		rc = rc ? rc : wait();
@@ -615,7 +615,7 @@ void grbProgram(
 				out.iterations = data_in.nsweeps;
 
                 rc = grb::algorithms::simulated_annealing_RE(
-					sweep, states, J, h, energies, betas, data_in.nsweeps, data_in.use_pt
+					sweep, states, J, h, energies, betas, temp_states, temp_energies, temp_h, temp_log_rand, temp_deltav, data_in.nsweeps, data_in.use_pt
                 );
 
 			}

From 6112d228330016b7a892fd661e44a0f4c23f82fd Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Thu, 23 Oct 2025 17:08:20 +0200
Subject: [PATCH 07/58] Masked rows sweep works -- needs cleanup

---
 .../algorithms/simulated_annealing_re.hpp     |   8 +-
 tests/smoke/simulated_annealing_re.cpp        | 140 ++++++++++++------
 2 files changed, 98 insertions(+), 50 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 25be3967b..30c007ada 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -135,7 +135,8 @@ namespace grb {
 						 const TempType&,
 						 grb::Vector< QType >&,
 						 grb::Vector< QType >&,
-						 grb::Vector< EnergyType >&,
+						 grb::Vector< StateType >&,
+					 	 const std::vector< grb::Vector< bool > >&,
 						 const Ring&
 				 	)
 				> &sweep,
@@ -148,7 +149,8 @@ namespace grb {
 				grb::Vector< EnergyType > &temp_energies,
 				grb::Vector< QType > &temp_sweep1,
 				grb::Vector< QType > &temp_sweep2,
-				grb::Vector< EnergyType > &temp_sweep3,
+				grb::Vector< StateType > &temp_sweep3,
+				const std::vector< grb::Vector< bool > >& masks,
 				const size_t &n_sweeps = 1,
 				const bool &use_pt = false,
 				const Ring &ring = Ring()
@@ -188,7 +190,7 @@ namespace grb {
 
 				for( size_t j = 0 ; rc == grb::SUCCESS && j < n_replicas ; ++j ){
 					
-				energies[j] += sweep( couplings, local_fields, states[j], betas[j], temp_sweep1, temp_sweep2 , temp_sweep3 , ring );
+				energies[j] += sweep( couplings, local_fields, states[j], betas[j], temp_sweep1, temp_sweep2 , temp_sweep3, masks , ring );
 				
 					// update_best state and energy
 					if( energies[j] < temp_energies[j] ){
diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index 01e74926a..44173c1fd 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -40,7 +40,7 @@ using namespace grb;
 
 
 // Types
-using IOType = int;   // scalar/vector element type
+using IOType = double;   // scalar/vector element type
 using JType  = double;   // coupling (matrix) value type
 using EnergyType  = double;   // coupling (matrix) value type
 
@@ -118,11 +118,19 @@ namespace test_data {
     const size_t nnz = j_matrix_data.size();
 
     const std::vector< JType > h_array_data = {
-		-0.08910436,  0.58034508,  0.97719304,  0.16792909,
-		-0.9221754 , -0.10715418 -0.62365497,  0.25411129,
-		-0.5693644 , -0.69805978 , 0.07228861 -0.79922641,
-		0.46231686 , 0.87930208 , 0.88663637, -0.25052299
+        -0.08910436,  0.58034508,  0.97719304,  0.16792909,
+		-0.9221754 , -0.10715418, -0.62365497,  0.25411129,
+		-0.5693644 , -0.69805978,  0.07228861, -0.79922641,
+		0.46231686 , 0.87930208 ,  0.88663637, -0.25052299,
     };
+
+	const std::vector< std::vector< size_t > > row_blocks = {
+		// {3, 1, 6, 7, 9, 11, 12, 13, 14, 15}, {5, 2, 0, 8, 10}, {4} // for python data files
+		// {2, 4}, {0, 3, 11, 12, 13, 14, 15}, {1, 5, 6, 7, 8, 9, 10}
+		{0, 2, 4, 7, 9, 12, 13, 15}, {1, 3, 6, 8, 11},
+		{5, 10, 14},
+		
+	};
 }
 // --- New, minimal runner configuration and result types ---
 struct input {
@@ -268,14 +276,15 @@ EnergyType get_energy(
 				 const grb::Vector< IOType > &state,
 				 const Ring &ring = Ring()
 			  ){
-	static grb::Vector< JType > tmp ( grb::size( state ) );
-	grb::clear( tmp );
+	static grb::Vector< JType > tmp ( grb::size( local_fields ) );
+	grb::RC rc = grb::clear( tmp );
 	JType energy = 0.0;
 
-	grb::mxv( tmp, couplings, state, ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
-	grb::foldl( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeOperator() );
-	grb::foldl( tmp, local_fields, ring.getAdditiveOperator() );
-	grb::dot( energy, tmp, state, ring );
+	rc = rc ? rc : grb::mxv( tmp, couplings, state, ring );
+	rc = rc ? rc : grb::foldl( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeOperator() );
+	rc = rc ? rc : grb::foldl( tmp, local_fields, ring.getAdditiveOperator() );
+	rc = rc ? rc : grb::dot( energy, tmp, state, ring );
+	assert( rc == grb::SUCCESS );
 
 	return energy;
 }
@@ -296,7 +305,8 @@ static EnergyType sequential_sweep_immediate(
 				 const JType &beta,
 				 grb::Vector< JType > &h,
 				 grb::Vector< JType > &log_rand,
-				 grb::Vector< EnergyType > &deltav,
+				 grb::Vector< JType > &delta,
+				 const std::vector< grb::Vector< bool > > &masks,
 				 const Ring &ring = Ring()
 			  ){
 
@@ -316,41 +326,71 @@ static EnergyType sequential_sweep_immediate(
 			const auto rand = randi / rm ;
 			rc = rc ? rc : grb::setElement(log_rand,  std::log( rand ), j );
 		}
+		// print_vector( log_rand, 30, "log_rand" );
+
+		grb::Vector< EnergyType > dn ( n ); // TODO don't allocate O(n) memory
+		grb::Vector< bool > accept ( n );
 
 #ifndef NDEBUG
 		const grb::Vector< IOType > old_state = state;
+		const auto h0 = h;
 #endif
-		// TODO: masking
-		for( size_t j = 0 ; rc == grb::SUCCESS && j < n ; ++j ){
-			EnergyType delta = static_cast< EnergyType >( 0 );
-			EnergyType dn = static_cast< EnergyType >( 0 );
-			dn = (2.0 * state[ j ] - 1.0) * h[ j ];
-
-			const bool accept = ( dn >= 0 ) || ( log_rand[ j ] < beta * dn );
-			const IOType old = state[ j ];
-			rc = rc ? rc : grb::setElement( state,  (accept ? 1 - old : old), j );
-
-			// grb::setElement( delta,  static_cast< IOType >( state[j] - old ), j );
-			delta =  static_cast< EnergyType >( state[j] - old );
-			delta_energy -= dn * accept ;
+		grb::wait();
+		for(const auto &mask : masks ){
+
+			rc = rc ? rc : grb::clear( accept  );
+			rc = rc ? rc : grb::clear( delta  );
+			rc = rc ? rc : grb::clear( dn );
+
+			// dn = (2*state_slice - 1) * h_slice
+			rc = rc ? rc : grb::set( dn, mask, state );
+			rc = rc ? rc : grb::foldl( dn, mask, static_cast< EnergyType >( 2 ), ring.getMultiplicativeOperator()  );
+			rc = rc ? rc : grb::foldl( dn, mask, static_cast< EnergyType >( -1 ), ring.getAdditiveOperator() );
+			rc = rc ? rc : grb::foldl( dn, h, ring.getMultiplicativeMonoid() );
+
+			// ( dn >= 0 ) | ( log_rand < beta * dn )
+			rc = rc ? rc : grb::set( accept, mask );
+			rc = rc ? rc : grb::eWiseLambda<>(
+					[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
+						(void) i;
+						if( mask[i] ){
+							accept[i] = ( dn[i] >= 0 ) || ( log_rand[i] < beta * dn[i] );
+						}
+					}, accept, log_rand, mask, dn );
+
+
+			// new_state = np.where(accept, 1 - old, old)
+			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeOperator() );
+			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( 1 ), ring.getAdditiveOperator() );
+			
+			// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
+			rc = rc ? rc : grb::clear( delta  );
+			rc = rc ? rc : grb::set( delta, accept, state );
+			rc = rc ? rc : grb::foldl( delta, accept, static_cast< JType >( 2.0 ), ring.getMultiplicativeMonoid() );
+			rc = rc ? rc : grb::foldl( delta, accept, static_cast< JType >( -1.0 ), ring.getAdditiveMonoid() );
+			
+			// Update delta_energy -= dot(dn, accept)
+			rc = rc ? rc : grb::dot( delta_energy, delta, h, ring );
 
 			// update h
-			if( delta ){
-				rc = rc ? rc : grb::clear( deltav );
-				rc = rc ? rc : grb::setElement( deltav, delta , j );
+			rc = rc ? rc : grb::mxv( h, couplings, delta, ring );
+			
+			grb::wait();
+		}
+		// delta_energy = - delta_energy;
 
-				rc = rc ? rc : grb::mxv( h, couplings, deltav, ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
-			}
+#ifndef NDEBUG
+		if( rc != grb::SUCCESS ){
+			std::cerr << "\n\t Error in some GraphBLAS function " << rc << " : " << grb::toString( rc ) << std::endl;
 		}
 		assert( rc == grb::SUCCESS );
-#ifndef NDEBUG
 		const auto new_state = state;
 
-		// std::cerr << "\n\t Delta_energy: " << delta_energy;
-		// std::cerr << "\n\t Real delta: " << (get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state));
-		// std::cerr << "\n\t Old energy: " << get_energy(couplings, local_fields, old_state) ;
-		// std::cerr << "\n\t New energy: " << get_energy(couplings, local_fields, new_state);
-		// std::cerr << std::endl;
+		std::cerr << "\n\t Delta_energy: " << delta_energy;
+		std::cerr << "\n\t Real delta: " << (get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state));
+		std::cerr << "\n\t Old energy: " << get_energy(couplings, local_fields, old_state) ;
+		std::cerr << "\n\t New energy: " << get_energy(couplings, local_fields, new_state);
+		std::cerr << std::endl;
 
 		assert( ISCLOSE(get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state), delta_energy ) );
 #endif
@@ -372,7 +412,8 @@ std::function< EnergyType(
 					 const JType&,
 					 grb::Vector< JType >&,
 					 grb::Vector< JType >&,
-					 grb::Vector< EnergyType >&,
+					 grb::Vector< IOType >&,
+					 const std::vector< grb::Vector< bool > >&,
 					 const Ring&
 				 ) > get_sweep_function( std::string sweep_name ){
 	if( sweep_name != "sequential_sweep_immediate" ){
@@ -501,6 +542,18 @@ void grbProgram(
 		);
     }
 
+	assert( grb::nnz( grb::Vector< bool >( n ) ) == 0 );
+
+	// build masks from row block indices
+    std::vector< grb::Vector< bool > > masks;
+	for(const auto&v : test_data::row_blocks ){
+		masks.emplace_back( grb::Vector< bool >( n ) );
+		for(const auto&i : v ){
+			grb::setElement( masks.back(), 1, i );
+		}
+		print_vector( masks.back(), 30, "MASK" );
+	}
+
     // create states storage and initialize with random 1/0 values
     const size_t n_replicas = std::get<3>(Storage::getData());
     std::vector< grb::Vector<IOType> > states;
@@ -535,9 +588,6 @@ void grbProgram(
             std::cout << std::endl;
         }
 
-		// grb::Vector< JType > zero ( n );
-		// grb::set( zero, 0 );
-		// grb::setElement( zero, 1, 1 );
 		// assert( std::abs(get_energy(  J, h, zero ) - 0.5803450826765713) < 1e-4 );
     }
     #endif
@@ -553,15 +603,11 @@ void grbProgram(
     }
     rc = rc ? rc : wait();
 
-    // also make energies vector os size n_replicas and calculate values
-    // in python energies = np.array([get_energy(couplings, local_fields, state) for state in states])
-    // will be initalize in the algorithm
-
 
     std::vector< grb::Vector<IOType> > temp_states;
 	grb::Vector< JType > temp_h ( n );
 	grb::Vector< JType > temp_log_rand ( n );
-	grb::Vector< EnergyType > temp_deltav ( n );
+	grb::Vector< IOType > temp_deltav ( n );
 
 
 	out.rep = data_in.rep;
@@ -569,7 +615,7 @@ void grbProgram(
 	if( out.rep == 0 ) {
 		timer.reset();
 		rc = grb::algorithms::simulated_annealing_RE(
-				sweep, states, J, h, energies, betas, temp_states, temp_energies, temp_h, temp_log_rand, temp_deltav, data_in.nsweeps, data_in.use_pt
+				sweep, states, J, h, energies, betas, temp_states, temp_energies, temp_h, temp_log_rand, temp_deltav, masks, data_in.nsweeps, data_in.use_pt
         );
 
 		rc = rc ? rc : wait();
@@ -615,7 +661,7 @@ void grbProgram(
 				out.iterations = data_in.nsweeps;
 
                 rc = grb::algorithms::simulated_annealing_RE(
-					sweep, states, J, h, energies, betas, temp_states, temp_energies, temp_h, temp_log_rand, temp_deltav, data_in.nsweeps, data_in.use_pt
+					sweep, states, J, h, energies, betas, temp_states, temp_energies, temp_h, temp_log_rand, temp_deltav, masks, data_in.nsweeps, data_in.use_pt
                 );
 
 			}

From 83f4b3a58d0f9c8e07e71446e53afc346747f4ce Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Thu, 23 Oct 2025 17:16:20 +0200
Subject: [PATCH 08/58] Cleanup and test with different types

---
 tests/smoke/simulated_annealing_re.cpp | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index 44173c1fd..0b5a76c5b 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -40,8 +40,8 @@ using namespace grb;
 
 
 // Types
-using IOType = double;   // scalar/vector element type
-using JType  = double;   // coupling (matrix) value type
+using IOType = int8_t;   // scalar/vector element type
+using JType  = float;   // coupling (matrix) value type
 using EnergyType  = double;   // coupling (matrix) value type
 
 /** Parser type */
@@ -305,7 +305,7 @@ static EnergyType sequential_sweep_immediate(
 				 const JType &beta,
 				 grb::Vector< JType > &h,
 				 grb::Vector< JType > &log_rand,
-				 grb::Vector< JType > &delta,
+				 grb::Vector< IOType > &delta,
 				 const std::vector< grb::Vector< bool > > &masks,
 				 const Ring &ring = Ring()
 			  ){
@@ -366,8 +366,8 @@ static EnergyType sequential_sweep_immediate(
 			// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
 			rc = rc ? rc : grb::clear( delta  );
 			rc = rc ? rc : grb::set( delta, accept, state );
-			rc = rc ? rc : grb::foldl( delta, accept, static_cast< JType >( 2.0 ), ring.getMultiplicativeMonoid() );
-			rc = rc ? rc : grb::foldl( delta, accept, static_cast< JType >( -1.0 ), ring.getAdditiveMonoid() );
+			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( 2.0 ), ring.getMultiplicativeMonoid() );
+			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( -1.0 ), ring.getAdditiveMonoid() );
 			
 			// Update delta_energy -= dot(dn, accept)
 			rc = rc ? rc : grb::dot( delta_energy, delta, h, ring );
@@ -377,7 +377,6 @@ static EnergyType sequential_sweep_immediate(
 			
 			grb::wait();
 		}
-		// delta_energy = - delta_energy;
 
 #ifndef NDEBUG
 		if( rc != grb::SUCCESS ){
@@ -386,11 +385,11 @@ static EnergyType sequential_sweep_immediate(
 		assert( rc == grb::SUCCESS );
 		const auto new_state = state;
 
-		std::cerr << "\n\t Delta_energy: " << delta_energy;
-		std::cerr << "\n\t Real delta: " << (get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state));
-		std::cerr << "\n\t Old energy: " << get_energy(couplings, local_fields, old_state) ;
-		std::cerr << "\n\t New energy: " << get_energy(couplings, local_fields, new_state);
-		std::cerr << std::endl;
+		// std::cerr << "\n\t Delta_energy: " << delta_energy;
+		// std::cerr << "\n\t Real delta: " << (get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state));
+		// std::cerr << "\n\t Old energy: " << get_energy(couplings, local_fields, old_state) ;
+		// std::cerr << "\n\t New energy: " << get_energy(couplings, local_fields, new_state);
+		// std::cerr << std::endl;
 
 		assert( ISCLOSE(get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state), delta_energy ) );
 #endif

From 5771f64ed962ed01111b9ef2db51259b496ba224 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 28 Oct 2025 16:04:24 +0100
Subject: [PATCH 09/58] Cleanup and better temporary data passing to sweep
 function

---
 .../algorithms/simulated_annealing_re.hpp     |  53 ++++---
 tests/smoke/simulated_annealing_re.cpp        | 140 ++++++++++--------
 2 files changed, 107 insertions(+), 86 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 30c007ada..d4f3cad5d 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -112,14 +112,16 @@ namespace grb {
 		 * @tparam StateType	The state variable type.
 		 * @tparam EnergyType	The energy type.
 		 * @tparam TempType		The inverse temperature type.
+		 * @tparam SweepDataType	Type of data to be passed on to the sweep function (e.g. a tuple of references to temporary vectors).
 		 * @tparam Ring			The semiring under which to make the sweeps.
 		 *
 		 */
 		template<
 			typename QType, // type of coupling matrix values
-			typename StateType, // type of state, ideally 0/1
+			typename StateType, // type of state, possibly 0/1
 			typename EnergyType,
 			typename TempType,
+			typename SweepDataType, // type of data to be passed through to the sweep function
 			typename RSI, typename CSI, typename NZI, Backend backend,
 			class Ring = Semiring<
 				grb::operators::add< QType >, grb::operators::mul< QType >,
@@ -133,10 +135,7 @@ namespace grb {
 						 const grb::Vector< QType, backend >&,
 						 grb::Vector< StateType, backend >&,
 						 const TempType&,
-						 grb::Vector< QType >&,
-						 grb::Vector< QType >&,
-						 grb::Vector< StateType >&,
-					 	 const std::vector< grb::Vector< bool > >&,
+						 SweepDataType&,
 						 const Ring&
 				 	)
 				> &sweep,
@@ -147,36 +146,35 @@ namespace grb {
 				grb::Vector< TempType > &betas,
 				std::vector< grb::Vector< StateType, backend > >  &temp_states,
 				grb::Vector< EnergyType > &temp_energies,
-				grb::Vector< QType > &temp_sweep1,
-				grb::Vector< QType > &temp_sweep2,
-				grb::Vector< StateType > &temp_sweep3,
-				const std::vector< grb::Vector< bool > >& masks,
+				SweepDataType& temp_sweep,
 				const size_t &n_sweeps = 1,
 				const bool &use_pt = false,
 				const Ring &ring = Ring()
 				){
 
-			size_t n_replicas = states.size();
+			const size_t n_replicas = states.size();
+			const size_t n = grb::size(states[0]);
 
 			assert( n_replicas > 0 );
 			assert( n_replicas == grb::size( betas ) );
-			assert( grb::ncols( couplings ) == grb::nrows( couplings ) );
-			assert( grb::size( states[0] ) == grb::nrows( couplings ) );
-			assert( grb::size( states[0] ) == grb::size( local_fields ) );
+			assert( n == grb::ncols( couplings ) );
+			assert( n == grb::nrows( couplings ) );
+			assert( n == grb::size( local_fields ) );
 
-			for(size_t i = 1; i < n_replicas ; ++i ){
-				assert( grb::size( states[0] ) == grb::size( states[ i ] ) );
+			for(size_t i = 0; i < n_replicas ; ++i ){
+				assert( n == grb::size( states[ i ] ) );
 			}
 
-			const size_t n = grb::size(states[0]);
 
 #ifndef NDEBUG
-			std::cerr << "DEBUG: Called  simulated_annealing_RE with parameters: "
-				      << "\n\t n = " << n
-				      << "\n\t n_replicas = " << n_replicas
-				      << "\n\t n_sweeps = " << n_sweeps
-				      << "\n\t use_pt = " << use_pt
-				      << std::endl;
+			if( grb::spmd<>::pid() == 0 ) {
+				std::cerr << "DEBUG: Called  simulated_annealing_RE with parameters: "
+						  << "\n\t n = " << n
+						  << "\n\t n_replicas = " << n_replicas
+						  << "\n\t n_sweeps = " << n_sweeps
+						  << "\n\t use_pt = " << use_pt
+						  << std::endl;
+			}
 #endif
 
 			grb::RC rc = grb::SUCCESS;
@@ -185,12 +183,9 @@ namespace grb {
 			temp_states =  states;
 
 			for( size_t i_sweep = 0 ; rc == grb::SUCCESS && i_sweep < n_sweeps ; ++i_sweep ){
-				// randomize order of replicas
-				// std::random_shuffle( states.begin(), states.end() );
-
-				for( size_t j = 0 ; rc == grb::SUCCESS && j < n_replicas ; ++j ){
+				for( size_t j = 0 ; j < n_replicas ; ++j ){
 					
-				energies[j] += sweep( couplings, local_fields, states[j], betas[j], temp_sweep1, temp_sweep2 , temp_sweep3, masks , ring );
+					energies[j] += sweep( couplings, local_fields, states[j], betas[j], temp_sweep, ring );
 				
 					// update_best state and energy
 					if( energies[j] < temp_energies[j] ){
@@ -204,7 +199,9 @@ namespace grb {
 					rc = pt( states, energies, betas );
 				}
 #ifndef NDEBUG
-				std::cerr << "Energy at iteration " << i_sweep << " = " << energies[ 0 ] << std::endl;
+				if( grb::spmd<>::pid() == 0 ) {
+					std::cerr << "Energy at iteration " << i_sweep << " = " << energies[ 0 ] << std::endl;
+				}
 #endif
 			} // n_sweeps
 
diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index 0b5a76c5b..93d14144f 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -18,6 +18,7 @@
 #include <string>
 #include <memory>
 #include <algorithm>
+#include <random>
 #include <cassert>
 #include <cstdlib>
 #include <unistd.h>
@@ -31,7 +32,6 @@
 #include <utils/output_verification.hpp>
 #include <graphblas.hpp>
 #include <utils/print_vec_mat.hpp>
-#include <random>
 
 using namespace grb;
 
@@ -126,11 +126,13 @@ namespace test_data {
 
 	const std::vector< std::vector< size_t > > row_blocks = {
 		// {3, 1, 6, 7, 9, 11, 12, 13, 14, 15}, {5, 2, 0, 8, 10}, {4} // for python data files
-		// {2, 4}, {0, 3, 11, 12, 13, 14, 15}, {1, 5, 6, 7, 8, 9, 10}
-		{0, 2, 4, 7, 9, 12, 13, 15}, {1, 3, 6, 8, 11},
+		{0, 2, 4, 7, 9, 12, 13, 15},
+		{1, 3, 6, 8, 11},
 		{5, 10, 14},
 		
 	};
+
+    std::minstd_rand global_rng ( 8 ); // or std::mt19937
 }
 // --- New, minimal runner configuration and result types ---
 struct input {
@@ -249,7 +251,6 @@ void read_vector_data(const std::string &filename, std::vector<Dtype> &data) {
     }
 }
 
-
 template< typename Dtype >
 void read_vector_data_from_array(
 	const std::vector<Dtype> &array, std::vector<Dtype> &data
@@ -278,21 +279,18 @@ EnergyType get_energy(
 			  ){
 	static grb::Vector< JType > tmp ( grb::size( local_fields ) );
 	grb::RC rc = grb::clear( tmp );
-	JType energy = 0.0;
+	EnergyType energy = 0.0;
 
 	rc = rc ? rc : grb::mxv( tmp, couplings, state, ring );
-	rc = rc ? rc : grb::foldl( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeOperator() );
-	rc = rc ? rc : grb::foldl( tmp, local_fields, ring.getAdditiveOperator() );
+	rc = rc ? rc : grb::foldl( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() );
+	rc = rc ? rc : grb::foldl( tmp, local_fields, ring.getAdditiveMonoid() );
 	rc = rc ? rc : grb::dot( energy, tmp, state, ring );
 	assert( rc == grb::SUCCESS );
 
 	return energy;
 }
 
-
-
 template<
-		// Backend backend=grb::reference,
 		class Ring = Semiring<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one
@@ -303,10 +301,14 @@ static EnergyType sequential_sweep_immediate(
 				 const grb::Vector< JType > &local_fields,
 				 grb::Vector< IOType > &state,
 				 const JType &beta,
-				 grb::Vector< JType > &h,
-				 grb::Vector< JType > &log_rand,
-				 grb::Vector< IOType > &delta,
-				 const std::vector< grb::Vector< bool > > &masks,
+				 std::tuple<
+					 grb::Vector< JType >&,
+					 grb::Vector< JType >&,
+					 grb::Vector< IOType >&,
+					 const std::vector< grb::Vector< bool > >&,
+					 grb::Vector< EnergyType >&,
+					 grb::Vector< bool >&
+					 > &data,
 				 const Ring &ring = Ring()
 			  ){
 
@@ -314,23 +316,29 @@ static EnergyType sequential_sweep_immediate(
 		const size_t n = grb::size( state );
 		EnergyType delta_energy = static_cast< JType >(0.0);
 
-		rc = rc ? rc : grb::resize( h, n );
-		rc = rc ? rc : grb::set( h, static_cast< JType >( 0.0 ) );
-		rc = rc ? rc : grb::mxv( h, couplings, state , ring.getAdditiveMonoid(), ring.getMultiplicativeOperator() );
-		rc = rc ? rc : grb::foldl( h, local_fields, ring.getAdditiveOperator() );
+		auto &h 		= std::get<0>(data);
+		auto &log_rand	= std::get<1>(data);
+		auto &delta		= std::get<2>(data);
+		const auto &masks = std::get<3>(data);
+		auto &dn		= std::get<4>(data);
+		auto &accept	= std::get<5>(data);
 
+		rc = rc ? rc : grb::resize( h, n );
 		rc = rc ? rc : grb::resize( log_rand, n );
+		rc = rc ? rc : grb::resize( delta, n );
+		rc = rc ? rc : grb::resize( dn, n );
+		rc = rc ? rc : grb::resize( accept, n );
+
+		rc = rc ? rc : grb::set( h, local_fields );
+		rc = rc ? rc : grb::mxv( h, couplings, state , ring );
+
+		std::uniform_real_distribution< JType > rand ( 0.0, 1.0 );
 		for( size_t j = 0 ; j < n ; ++j ){
-			constexpr auto rm = static_cast< JType >( RAND_MAX ) + 2;
-			const auto randi =static_cast< JType >( std::rand() ) + 1;
-			const auto rand = randi / rm ;
-			rc = rc ? rc : grb::setElement(log_rand,  std::log( rand ), j );
+			const auto rnd = rand( test_data::global_rng );
+			rc = rc ? rc : grb::setElement(log_rand,  std::log( rnd ), j );
 		}
 		// print_vector( log_rand, 30, "log_rand" );
 
-		grb::Vector< EnergyType > dn ( n ); // TODO don't allocate O(n) memory
-		grb::Vector< bool > accept ( n );
-
 #ifndef NDEBUG
 		const grb::Vector< IOType > old_state = state;
 		const auto h0 = h;
@@ -344,8 +352,8 @@ static EnergyType sequential_sweep_immediate(
 
 			// dn = (2*state_slice - 1) * h_slice
 			rc = rc ? rc : grb::set( dn, mask, state );
-			rc = rc ? rc : grb::foldl( dn, mask, static_cast< EnergyType >( 2 ), ring.getMultiplicativeOperator()  );
-			rc = rc ? rc : grb::foldl( dn, mask, static_cast< EnergyType >( -1 ), ring.getAdditiveOperator() );
+			rc = rc ? rc : grb::foldl( dn, static_cast< EnergyType >( 2 ), ring.getMultiplicativeMonoid()  );
+			rc = rc ? rc : grb::foldl( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
 			rc = rc ? rc : grb::foldl( dn, h, ring.getMultiplicativeMonoid() );
 
 			// ( dn >= 0 ) | ( log_rand < beta * dn )
@@ -358,16 +366,15 @@ static EnergyType sequential_sweep_immediate(
 						}
 					}, accept, log_rand, mask, dn );
 
-
 			// new_state = np.where(accept, 1 - old, old)
-			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeOperator() );
-			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( 1 ), ring.getAdditiveOperator() );
+			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeMonoid() );
+			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( 1 ), ring.getAdditiveMonoid() );
 			
 			// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
 			rc = rc ? rc : grb::clear( delta  );
 			rc = rc ? rc : grb::set( delta, accept, state );
-			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( 2.0 ), ring.getMultiplicativeMonoid() );
-			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( -1.0 ), ring.getAdditiveMonoid() );
+			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( 2 ), ring.getMultiplicativeMonoid() );
+			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( -1 ), ring.getAdditiveMonoid() );
 			
 			// Update delta_energy -= dot(dn, accept)
 			rc = rc ? rc : grb::dot( delta_energy, delta, h, ring );
@@ -381,6 +388,7 @@ static EnergyType sequential_sweep_immediate(
 #ifndef NDEBUG
 		if( rc != grb::SUCCESS ){
 			std::cerr << "\n\t Error in some GraphBLAS function " << rc << " : " << grb::toString( rc ) << std::endl;
+			abort();
 		}
 		assert( rc == grb::SUCCESS );
 		const auto new_state = state;
@@ -399,20 +407,25 @@ static EnergyType sequential_sweep_immediate(
 
 
 template<
-		// Backend backend=grb::reference,
+		typename SweepDataType = std::tuple<
+					 grb::Vector< JType >&,
+					 grb::Vector< JType >&,
+					 grb::Vector< IOType >&,
+					 const std::vector< grb::Vector< bool > >&,
+					 grb::Vector< EnergyType >&,
+					 grb::Vector< bool >&
+					 >,
 		class Ring = Semiring<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one
-		> >
+		>
+	>
 std::function< EnergyType(
 					 const grb::Matrix< JType >&,
 					 const grb::Vector< JType >&,
 					 grb::Vector< IOType >&,
 					 const JType&,
-					 grb::Vector< JType >&,
-					 grb::Vector< JType >&,
-					 grb::Vector< IOType >&,
-					 const std::vector< grb::Vector< bool > >&,
+					 SweepDataType&,
 					 const Ring&
 				 ) > get_sweep_function( std::string sweep_name ){
 	if( sweep_name != "sequential_sweep_immediate" ){
@@ -432,7 +445,7 @@ void ioProgram( const struct input &data_in, bool &success ) {
     // auto &nnz         = std::get<1>(storage); // nz (nonzeros)
     auto &nsweeps_st  = std::get<2>(storage); // nsweeps
     auto &n_replicas_st = std::get<3>(storage); // n_replicas
-    auto &use_pt      = std::get<4>(storage); // seed
+    auto &use_pt      = std::get<4>(storage); // use_pt
     auto &seed_st     = std::get<5>(storage); // seed
     auto &sweep_name  = std::get<6>(storage); // sweep_name
     auto &Jdata       = std::get<7>(storage); // std::vector<NonzeroT>
@@ -484,7 +497,9 @@ void grbProgram(
 
     /* --- Problem setup --- */
     const size_t n = std::get<0>(Storage::getData());
-	std::cout << "problem size n = " << n << "\n";
+	if( s == 0 ){
+		std::cout << "problem size n = " << n << "\n";
+	}
     grb::Vector< JType > h( n );
     // populate J with test (random) values
     grb::RC rc = grb::SUCCESS;
@@ -559,13 +574,12 @@ void grbProgram(
     for ( size_t r = 0; r < n_replicas; ++r ) {
         states.emplace_back( grb::Vector<IOType>(n) );
         // initialize with random values
-        std::default_random_engine generator( std::get<5>(Storage::getData()) + r );
-        std::uniform_int_distribution< int > distribution(0,1);
+        std::uniform_int_distribution< unsigned short > randint(0,1);
         // we use buildvectorUnique with a random set of indices
         std::vector< IOType > rand_data;
         for ( size_t i = 0; i < n; ++i ) {
             rand_data.emplace_back( static_cast<IOType>(
-                distribution(generator) ) );
+                randint( test_data::global_rng ) ) );
         }
         rc = rc ? rc : grb::buildVector(
             states.back(),
@@ -606,7 +620,18 @@ void grbProgram(
     std::vector< grb::Vector<IOType> > temp_states;
 	grb::Vector< JType > temp_h ( n );
 	grb::Vector< JType > temp_log_rand ( n );
-	grb::Vector< IOType > temp_deltav ( n );
+	grb::Vector< EnergyType > temp_dn ( n );
+	grb::Vector< bool > temp_accept ( n );
+	grb::Vector< IOType > temp_delta ( n );
+	auto sweep_data = std::tie(
+ 			temp_h,
+			temp_log_rand,
+			temp_delta,
+			(const typeof(masks)&) masks,
+			temp_dn,
+			temp_accept
+			);
+
 
 
 	out.rep = data_in.rep;
@@ -614,7 +639,7 @@ void grbProgram(
 	if( out.rep == 0 ) {
 		timer.reset();
 		rc = grb::algorithms::simulated_annealing_RE(
-				sweep, states, J, h, energies, betas, temp_states, temp_energies, temp_h, temp_log_rand, temp_deltav, masks, data_in.nsweeps, data_in.use_pt
+				sweep, states, J, h, energies, betas, temp_states, temp_energies, sweep_data, data_in.nsweeps, data_in.use_pt
         );
 
 		rc = rc ? rc : wait();
@@ -660,22 +685,23 @@ void grbProgram(
 				out.iterations = data_in.nsweeps;
 
                 rc = grb::algorithms::simulated_annealing_RE(
-					sweep, states, J, h, energies, betas, temp_states, temp_energies, temp_h, temp_log_rand, temp_deltav, masks, data_in.nsweeps, data_in.use_pt
+				sweep, states, J, h, energies, betas, temp_states, temp_energies, sweep_data, data_in.nsweeps, data_in.use_pt
                 );
-
 			}
 			if( grb::Properties<>::isNonblockingExecution ) {
 				rc = rc ? rc : wait();
 			}
 		}
 		const double time_taken = timer.time();
-		for ( size_t r = 0; r < n_replicas; ++r ) {
-			std::cout << "Final state replica " << r << ":\n";
-			print_vector( states[r], 50 ,"states values" );  
-			std::cout << "With energy " << energies[ r ] << "\n";
-			std::cout << "With energy " << get_energy(  J, h, states[r] ) << "\n";
-			std::cout << std::endl;
-			assert( ISCLOSE( get_energy( J, h, states[r] ), energies[ r ] ) );
+		if( s == 0 ) {
+			for ( size_t r = 0; r < n_replicas; ++r ) {
+				std::cout << "Final state replica " << r << ":\n";
+				print_vector( states[r], 50 ,"states values" );  
+				std::cout << "With energy " << energies[ r ] << "\n";
+				std::cout << "With energy " << get_energy(  J, h, states[r] ) << "\n";
+				std::cout << std::endl;
+				assert( ISCLOSE( get_energy( J, h, states[r] ), energies[ r ] ) );
+			}
 		}
 		for(size_t i = 0 ; i < n_replicas ; ++i ){
 			out.best_energy = std::min( out.best_energy, energies[ i ] );
@@ -683,7 +709,7 @@ void grbProgram(
 
 		out.times.useful = time_taken / static_cast< double >( out.rep );
 		// print timing at root process
-		if( grb::spmd<>::pid() == 0 ) {
+		if( s == 0 ) {
 			std::cout << "Time taken for " << out.rep << " "
 				<< "Simulated Annealing RE calls (hot start): " << out.times.useful << ". "
 				<< "Error code is " << grb::toString( rc ) << std::endl;
@@ -698,8 +724,6 @@ void grbProgram(
 	// start postamble
 	timer.reset();
 
-
-
 	// set error code
 	if( rc == FAILED ) {
 		out.error_code = 30;
@@ -807,7 +831,7 @@ int main( int argc, char ** argv ) {
 
     // seed RNGs (C and C++ engines) using requested seed (hardcoded default 8 if not provided)
     std::srand( static_cast<unsigned>( in.seed ) );
-    static std::mt19937 global_rng( static_cast<unsigned>( in.seed ) );
+	test_data::global_rng.seed(in.seed);
 
     std::cout << "seed=" << in.seed << " n_replicas=" << in.n_replicas << " nsweeps=" << in.nsweeps << " sweep=" << in.sweep_name << "\n";
 

From bb493ac3c82e411648a6187684fbae74bc64dc51 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Thu, 30 Oct 2025 11:10:44 +0100
Subject: [PATCH 10/58] Remove unused ring parameter and template the sweep
 function type

---
 .../algorithms/simulated_annealing_re.hpp     | 46 ++++++------
 tests/smoke/simulated_annealing_re.cpp        | 72 ++++++++++++-------
 2 files changed, 67 insertions(+), 51 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index d4f3cad5d..1fe636209 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -66,27 +66,27 @@ namespace grb {
 			Backend backend
 			>
 		grb::RC pt(
-				const std::vector< grb::Vector< StateType, backend > > &states,
-				const grb::Vector< EnergyType > &energies,
-				grb::Vector< TempType > &betas
+				std::vector< grb::Vector< StateType, backend > > &states,
+				grb::Vector< EnergyType > &energies,
+				const grb::Vector< TempType > &betas
 				){
 			const size_t n_replicas = states.size();
 
-			for( size_t i = n_replicas-1 ; i > 0 ; --i ){
+			for( size_t i = 1 ; i < n_replicas ; ++i ){
         		const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
 
 				if( de >= 0 || std::rand() < RAND_MAX * exp( de ) ){
-					std::swap( betas[i], betas[i-1] );
-				}	
+					std::swap( states[i], states[i-1] );
+					std::swap( energies[i], energies[i-1] );
+				}
 			}
 
 			return grb::SUCCESS;
 		}
 
 		/*
-		 * Estimate a solution to a given Quadratic Unconstrained Binary Optimization
-		 * (QUBO) optimization problem. The solution is found using Simulated Annealing-
-		 * Replica Exchange (also known as Parallel Tempering).
+		 * Estimate a solution to a given optimization problem. The solution is found
+		 * using Simulated Annealing-Replica Exchange (also known as Parallel Tempering).
 		 *
 		 * The state will be optimized to minimize the value of the energy $U(x)$,
 		 * where $x$ is the binary state vector, and $couplings$ is the coupling matrix.
@@ -96,7 +96,7 @@ namespace grb {
 		 * returns the variation of energy made from its changes of the state.
 		 *
 		 * @param[in]     sweep      	The sweeping function.
-		 * 								Should return the energy variation implied from the changes that it made on the state.
+		 * 								Should return the energy variation relative to the changes that it made on the state.
 		 * @param[in,out] states        On input: initial states.
 		 *                              On output: optimized states.
 		 * @param[in]     couplings     The square (symmetric) couplings matrix.
@@ -113,7 +113,6 @@ namespace grb {
 		 * @tparam EnergyType	The energy type.
 		 * @tparam TempType		The inverse temperature type.
 		 * @tparam SweepDataType	Type of data to be passed on to the sweep function (e.g. a tuple of references to temporary vectors).
-		 * @tparam Ring			The semiring under which to make the sweeps.
 		 *
 		 */
 		template<
@@ -123,22 +122,18 @@ namespace grb {
 			typename TempType,
 			typename SweepDataType, // type of data to be passed through to the sweep function
 			typename RSI, typename CSI, typename NZI, Backend backend,
-			class Ring = Semiring<
-				grb::operators::add< QType >, grb::operators::mul< QType >,
-				grb::identities::zero, grb::identities::one
-				>
-			>
-		grb::RC simulated_annealing_RE(
-				const std::function< 
+			typename SweepFuncType = std::function< 
 					EnergyType(
 						 const grb::Matrix< QType, backend, RSI, CSI, NZI >&,
 						 const grb::Vector< QType, backend >&,
 						 grb::Vector< StateType, backend >&,
 						 const TempType&,
-						 SweepDataType&,
-						 const Ring&
+						 SweepDataType&
 				 	)
-				> &sweep,
+				>
+			>
+		grb::RC simulated_annealing_RE(
+				const SweepFuncType &sweep,
 				std::vector< grb::Vector< StateType, backend > > &states,
 				const grb::Matrix< QType, backend, RSI, CSI, NZI > &couplings,
 				const grb::Vector< QType, backend > &local_fields,
@@ -148,8 +143,7 @@ namespace grb {
 				grb::Vector< EnergyType > &temp_energies,
 				SweepDataType& temp_sweep,
 				const size_t &n_sweeps = 1,
-				const bool &use_pt = false,
-				const Ring &ring = Ring()
+				const bool &use_pt = false
 				){
 
 			const size_t n_replicas = states.size();
@@ -185,7 +179,9 @@ namespace grb {
 			for( size_t i_sweep = 0 ; rc == grb::SUCCESS && i_sweep < n_sweeps ; ++i_sweep ){
 				for( size_t j = 0 ; j < n_replicas ; ++j ){
 					
-					energies[j] += sweep( couplings, local_fields, states[j], betas[j], temp_sweep, ring );
+					grb::wait();
+					energies[j] += sweep( couplings, local_fields, states[j], betas[j], temp_sweep );
+					grb::wait();
 				
 					// update_best state and energy
 					if( energies[j] < temp_energies[j] ){
@@ -215,7 +211,7 @@ namespace grb {
 				states = temp_states;
 				energies = temp_energies;
 			}
-
+			
 			return rc;
 		}
 
diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index 93d14144f..ff45011cf 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -40,8 +40,8 @@ using namespace grb;
 
 
 // Types
-using IOType = int8_t;   // scalar/vector element type
-using JType  = float;   // coupling (matrix) value type
+using IOType = double;   // scalar/vector element type
+using JType  = double;   // coupling (matrix) value type
 using EnergyType  = double;   // coupling (matrix) value type
 
 /** Parser type */
@@ -126,10 +126,8 @@ namespace test_data {
 
 	const std::vector< std::vector< size_t > > row_blocks = {
 		// {3, 1, 6, 7, 9, 11, 12, 13, 14, 15}, {5, 2, 0, 8, 10}, {4} // for python data files
-		{0, 2, 4, 7, 9, 12, 13, 15},
-		{1, 3, 6, 8, 11},
-		{5, 10, 14},
-		
+		{0, 2, 4, 7, 9, 12, 13, 15}, {1, 3, 6, 8, 11}, {5, 10, 14},
+		// {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, {15}
 	};
 
     std::minstd_rand global_rng ( 8 ); // or std::mt19937
@@ -291,6 +289,14 @@ EnergyType get_energy(
 }
 
 template<
+		typename SweepDataType = std::tuple<
+					 grb::Vector< JType >&,
+					 grb::Vector< JType >&,
+					 grb::Vector< IOType >&,
+					 const std::vector< grb::Vector< bool > >&,
+					 grb::Vector< EnergyType >&,
+					 grb::Vector< bool >&
+					 >,
 		class Ring = Semiring<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one
@@ -308,9 +314,10 @@ static EnergyType sequential_sweep_immediate(
 					 const std::vector< grb::Vector< bool > >&,
 					 grb::Vector< EnergyType >&,
 					 grb::Vector< bool >&
-					 > &data,
-				 const Ring &ring = Ring()
+					 > &data
 			  ){
+		const Ring ring = Ring();
+
 
 		grb::RC rc = grb::SUCCESS;
 		const size_t n = grb::size( state );
@@ -323,6 +330,7 @@ static EnergyType sequential_sweep_immediate(
 		auto &dn		= std::get<4>(data);
 		auto &accept	= std::get<5>(data);
 
+		rc = rc ? rc : grb::wait();
 		rc = rc ? rc : grb::resize( h, n );
 		rc = rc ? rc : grb::resize( log_rand, n );
 		rc = rc ? rc : grb::resize( delta, n );
@@ -337,13 +345,12 @@ static EnergyType sequential_sweep_immediate(
 			const auto rnd = rand( test_data::global_rng );
 			rc = rc ? rc : grb::setElement(log_rand,  std::log( rnd ), j );
 		}
+		// rc = rc ? rc : grb::wait();
 		// print_vector( log_rand, 30, "log_rand" );
 
 #ifndef NDEBUG
 		const grb::Vector< IOType > old_state = state;
-		const auto h0 = h;
 #endif
-		grb::wait();
 		for(const auto &mask : masks ){
 
 			rc = rc ? rc : grb::clear( accept  );
@@ -358,17 +365,22 @@ static EnergyType sequential_sweep_immediate(
 
 			// ( dn >= 0 ) | ( log_rand < beta * dn )
 			rc = rc ? rc : grb::set( accept, mask );
+			rc = rc ? rc : grb::wait(); // ERROR: Segmentation Fault with nonblocking backend
 			rc = rc ? rc : grb::eWiseLambda<>(
 					[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
 						(void) i;
 						if( mask[i] ){
 							accept[i] = ( dn[i] >= 0 ) || ( log_rand[i] < beta * dn[i] );
 						}
-					}, accept, log_rand, mask, dn );
+					}, mask, log_rand, dn, accept );
+			// print_vector( log_rand, 30, "log_rand" );
+			// print_vector( mask, 30, "mask" );
+			// print_vector( accept, 30, "accept" );
 
 			// new_state = np.where(accept, 1 - old, old)
 			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeMonoid() );
 			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( 1 ), ring.getAdditiveMonoid() );
+			// print_vector( state, 30, "state" );
 			
 			// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
 			rc = rc ? rc : grb::clear( delta  );
@@ -378,12 +390,13 @@ static EnergyType sequential_sweep_immediate(
 			
 			// Update delta_energy -= dot(dn, accept)
 			rc = rc ? rc : grb::dot( delta_energy, delta, h, ring );
+			// rc = rc ? rc : grb::wait();
 
 			// update h
 			rc = rc ? rc : grb::mxv( h, couplings, delta, ring );
 			
-			grb::wait();
 		}
+		rc = rc ? rc : grb::wait();
 
 #ifndef NDEBUG
 		if( rc != grb::SUCCESS ){
@@ -392,14 +405,18 @@ static EnergyType sequential_sweep_immediate(
 		}
 		assert( rc == grb::SUCCESS );
 		const auto new_state = state;
+		rc = rc ? rc : grb::wait();
 
-		// std::cerr << "\n\t Delta_energy: " << delta_energy;
-		// std::cerr << "\n\t Real delta: " << (get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state));
+		const auto real_delta = get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state);
+		std::cerr << "\n\t Delta_energy: " << delta_energy;
+		std::cerr << "\n\t Real delta: " << real_delta;
+		std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy;
 		// std::cerr << "\n\t Old energy: " << get_energy(couplings, local_fields, old_state) ;
 		// std::cerr << "\n\t New energy: " << get_energy(couplings, local_fields, new_state);
-		// std::cerr << std::endl;
+		std::cerr << std::endl;
 
-		assert( ISCLOSE(get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state), delta_energy ) );
+		assert( ISCLOSE(real_delta, delta_energy ) );
+		// TODO: assert fails with nonblocking backend -> see issue #397
 #endif
 
 		return delta_energy;
@@ -415,19 +432,19 @@ template<
 					 grb::Vector< EnergyType >&,
 					 grb::Vector< bool >&
 					 >,
+		typename SweepFuncType = std::function< EnergyType(
+					 const grb::Matrix< JType >&,
+					 const grb::Vector< JType >&,
+					 grb::Vector< IOType >&,
+					 const JType&,
+					 SweepDataType&
+				 ) >,
 		class Ring = Semiring<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one
 		>
 	>
-std::function< EnergyType(
-					 const grb::Matrix< JType >&,
-					 const grb::Vector< JType >&,
-					 grb::Vector< IOType >&,
-					 const JType&,
-					 SweepDataType&,
-					 const Ring&
-				 ) > get_sweep_function( std::string sweep_name ){
+SweepFuncType get_sweep_function( std::string sweep_name ){
 	if( sweep_name != "sequential_sweep_immediate" ){
 			std::cerr << "Warning: unknown sweep setting. Falling back to  \"sequential_sweep_immediate\"" << std::endl;
 	}
@@ -565,7 +582,9 @@ void grbProgram(
 		for(const auto&i : v ){
 			grb::setElement( masks.back(), 1, i );
 		}
-		print_vector( masks.back(), 30, "MASK" );
+		if( s == 0 ){
+			print_vector( masks.back(), 30, "MASK" );
+		}
 	}
 
     // create states storage and initialize with random 1/0 values
@@ -631,7 +650,7 @@ void grbProgram(
 			temp_dn,
 			temp_accept
 			);
-
+	grb::wait();
 
 
 	out.rep = data_in.rep;
@@ -830,6 +849,7 @@ int main( int argc, char ** argv ) {
     }
 
     // seed RNGs (C and C++ engines) using requested seed (hardcoded default 8 if not provided)
+	in.seed += spmd<>::pid();
     std::srand( static_cast<unsigned>( in.seed ) );
 	test_data::global_rng.seed(in.seed);
 

From 3b292d386f79c3c7ff23bcca839b5a2d0b4e74c8 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 31 Oct 2025 11:27:07 +0100
Subject: [PATCH 11/58] SPMD parallel tempering with broadcast - to be tested

---
 .../algorithms/simulated_annealing_re.hpp     | 51 ++++++++++++++++---
 1 file changed, 45 insertions(+), 6 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 1fe636209..0b3b50f78 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -71,17 +71,56 @@ namespace grb {
 				const grb::Vector< TempType > &betas
 				){
 			const size_t n_replicas = states.size();
+			const size_t s 		= spmd<>::pid();
+			const size_t nprocs = spmd<>::nprocs();
+			grb::RC rc = grb::SUCCESS;
+			struct data {
+					grb::Vector< StateType, backend > *s;
+					EnergyType e;
+					TempType b;
+					int r;
+				};
+			static struct data msg[ 2 ];
+			int rand = std::rand();
 
-			for( size_t i = 1 ; i < n_replicas ; ++i ){
-        		const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
+			for( int si = static_cast< int >( nprocs ) - 1 ; si >= 0; --si ){
+				if( si == static_cast< int >( s ) ){
+					for( size_t i = n_replicas - 1 ; i > 0 ; --i ){
+						const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
 
-				if( de >= 0 || std::rand() < RAND_MAX * exp( de ) ){
-					std::swap( states[i], states[i-1] );
-					std::swap( energies[i], energies[i-1] );
+						if( de >= 0 || std::rand() < RAND_MAX * exp( de ) ){
+							std::swap( states[i], states[i-1] );
+							std::swap( energies[i], energies[i-1] );
+						}
+					}
+					msg[ 1 ].s = &states[ 0 ];
+					msg[ 1 ].e = energies[ 0 ];
+					msg[ 1 ].b = betas[0];
+					msg[ 1 ].r = rand;
+				}else if( si == static_cast< int >( s ) + 1 ){
+					msg[ 0 ].s = &states[ n_replicas - 1 ];
+					msg[ 0 ].e = energies[ n_replicas - 1 ];
+					msg[ 0 ].b = betas[ n_replicas - 1 ];
+					msg[ 0 ].r = rand;
+				}
+				if( si == 0 ) continue;
+				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ], si-1 );
+				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ], si );
+
+				const EnergyType de = ( msg[ 1 ].e - msg[ 0 ].e ) * ( msg[ 1 ].b - msg[ 0 ].b );
+
+				if( de >= 0 || msg[ 0 ].r < RAND_MAX * exp( de ) ){
+					if( si == static_cast< int >( s ) ){
+						states[ 0 ] = *msg[ 0 ].s;
+						energies[ 0 ] = msg[ 0 ].e;
+					}else if( si == static_cast< int >( s ) + 1 ){
+						states[ n_replicas-1 ] = *msg[ 1 ].s;
+						energies[ n_replicas-1 ] = msg[ 1 ].e;
+					}
 				}
 			}
 
-			return grb::SUCCESS;
+			return rc;
 		}
 
 		/*

From 6b7ae56fb2f07a0cc3bb9703474c57cee7335a5f Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Wed, 5 Nov 2025 10:22:10 +0100
Subject: [PATCH 12/58] Restructuring and making spmd work - almost

---
 .../algorithms/simulated_annealing_re.hpp     | 129 ++-
 tests/smoke/CMakeLists.txt                    |   5 +
 tests/smoke/simulated_annealing_re.cpp        | 144 +--
 .../smoke/simulated_annealing_re_from_mpi.cpp | 918 ++++++++++++++++++
 tests/utils/print_vec_mat.hpp                 |  12 +-
 5 files changed, 1109 insertions(+), 99 deletions(-)
 create mode 100644 tests/smoke/simulated_annealing_re_from_mpi.cpp

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 0b3b50f78..f38c5d3a9 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -37,7 +37,6 @@
 #include <iostream>
 #endif
 
-
 #include <graphblas.hpp>
 
 namespace grb {
@@ -58,33 +57,79 @@ namespace grb {
 		 * @tparam EnergyType	The energy type.
 		 * @tparam TempType		The inverse temperature type.
 		 *
+		 * This implementation of parallel tempering does not use any spmd characteristics.
 		 */
 		template<
+			Backend backend,
 			typename StateType, 
 			typename EnergyType,
-			typename TempType,
-			Backend backend
+			typename TempType
+			>
+	typename std::enable_if<
+		(grb::_GRB_BACKEND != grb::BSP1D) || (backend == grb::BSP1D),
+		grb::RC >::type
+	pt(
+				std::vector< grb::Vector< StateType, backend > > &states,
+				grb::Vector< EnergyType, backend > &energies,
+				const grb::Vector< TempType, backend > &betas
+				){
+
+			const size_t n_replicas = states.size();
+			// const size_t s 		= spmd<>::pid();
+			// const size_t nprocs = spmd<>::nprocs();
+			grb::RC rc = grb::SUCCESS;
+
+			for( size_t i = n_replicas - 1 ; i > 0 ; --i ){
+				const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
+
+				if( de >= 0 || std::rand() < RAND_MAX * exp( de ) ){
+					std::swap( states[i], states[i-1] );
+					std::swap( energies[i], energies[i-1] );
+				}
+			}
+
+			return rc;
+		}
+
+		/*
+		 * Implementation of parallel tempering using spmd.
+		 */
+		template<
+			Backend backend,
+			typename StateType, 
+			typename EnergyType,
+			typename TempType
 			>
-		grb::RC pt(
+			typename std::enable_if<
+				(grb::_GRB_BACKEND == grb::BSP1D) && (backend != grb::BSP1D),
+				grb::RC >::type
+		pt(
 				std::vector< grb::Vector< StateType, backend > > &states,
-				grb::Vector< EnergyType > &energies,
-				const grb::Vector< TempType > &betas
+				grb::Vector< EnergyType, backend > &energies,
+				const grb::Vector< TempType, backend > &betas
 				){
+			static_assert( backend != grb::BSP1D );
+			// static_assert( grb::_GRB_BACKEND == grb::BSP1D );
+
+			const size_t n = grb::size( states[0] );
 			const size_t n_replicas = states.size();
 			const size_t s 		= spmd<>::pid();
 			const size_t nprocs = spmd<>::nprocs();
 			grb::RC rc = grb::SUCCESS;
 			struct data {
-					grb::Vector< StateType, backend > *s;
+					grb::Vector< StateType, backend > s;
 					EnergyType e;
 					TempType b;
 					int r;
 				};
-			static struct data msg[ 2 ];
+			struct data msg[ 2 ];
+			grb::resize( msg[0].s, n );
+			grb::resize( msg[1].s, n );
 			int rand = std::rand();
 
-			for( int si = static_cast< int >( nprocs ) - 1 ; si >= 0; --si ){
-				if( si == static_cast< int >( s ) ){
+			for( size_t si = nprocs ; rc == grb::SUCCESS && si > 0; --si ){
+				std::cerr << "Hello from process " << s << std::endl;
+				if( si == s+1 ){
 					for( size_t i = n_replicas - 1 ; i > 0 ; --i ){
 						const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
 
@@ -93,29 +138,46 @@ namespace grb {
 							std::swap( energies[i], energies[i-1] );
 						}
 					}
-					msg[ 1 ].s = &states[ 0 ];
+					grb::set( msg[1].s, states[0] );
 					msg[ 1 ].e = energies[ 0 ];
 					msg[ 1 ].b = betas[0];
-					msg[ 1 ].r = rand;
-				}else if( si == static_cast< int >( s ) + 1 ){
-					msg[ 0 ].s = &states[ n_replicas - 1 ];
+					// msg[ 1 ].r = rand;
+				}else if( si == s+2 ){
+					grb::set( msg[0].s, states[ n_replicas - 1 ] );
 					msg[ 0 ].e = energies[ n_replicas - 1 ];
 					msg[ 0 ].b = betas[ n_replicas - 1 ];
 					msg[ 0 ].r = rand;
 				}
-				if( si == 0 ) continue;
-				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ], si-1 );
-				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ], si );
+				if( si == 1 ) continue;
+
+				std::cerr << "Calling broadcasts" << std::endl;
+				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].s, si-2 );
+				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].e, si-2 );
+				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].b, si-2 );
+				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].r, si-2 );
+				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].s, si-1 );
+				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].e, si-1 );
+				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].b, si-1 );
+
+#ifndef NDEBUG
+	
+				if( rc != grb::SUCCESS ){
+					std::cerr << "\n\t Error in a collective broadcast " << rc << " : " << grb::toString( rc ) << std::endl;
+				}
+				assert( rc == grb::SUCCESS );
+#endif
 
 				const EnergyType de = ( msg[ 1 ].e - msg[ 0 ].e ) * ( msg[ 1 ].b - msg[ 0 ].b );
 
-				if( de >= 0 || msg[ 0 ].r < RAND_MAX * exp( de ) ){
-					if( si == static_cast< int >( s ) ){
-						states[ 0 ] = *msg[ 0 ].s;
+				if( rc == grb::SUCCESS && ( de >= 0 || msg[ 0 ].r < RAND_MAX * exp( de ) ) ){
+					if( si == s+2 ){
+						states[ 0 ] = msg[ 0 ].s;
 						energies[ 0 ] = msg[ 0 ].e;
-					}else if( si == static_cast< int >( s ) + 1 ){
-						states[ n_replicas-1 ] = *msg[ 1 ].s;
+						// betas[ 0 ] = msg[ 0 ].b;
+					}else if( si ==  s+1 ){
+						states[ n_replicas-1 ] = msg[ 1 ].s;
 						energies[ n_replicas-1 ] = msg[ 1 ].e;
+						// betas[ n_replicas-1 ] = msg[ 1 ].b;
 					}
 				}
 			}
@@ -123,6 +185,7 @@ namespace grb {
 			return rc;
 		}
 
+
 		/*
 		 * Estimate a solution to a given optimization problem. The solution is found
 		 * using Simulated Annealing-Replica Exchange (also known as Parallel Tempering).
@@ -155,12 +218,13 @@ namespace grb {
 		 *
 		 */
 		template<
+			Backend backend,
 			typename QType, // type of coupling matrix values
 			typename StateType, // type of state, possibly 0/1
 			typename EnergyType,
 			typename TempType,
 			typename SweepDataType, // type of data to be passed through to the sweep function
-			typename RSI, typename CSI, typename NZI, Backend backend,
+			typename RSI, typename CSI, typename NZI,
 			typename SweepFuncType = std::function< 
 					EnergyType(
 						 const grb::Matrix< QType, backend, RSI, CSI, NZI >&,
@@ -176,15 +240,16 @@ namespace grb {
 				std::vector< grb::Vector< StateType, backend > > &states,
 				const grb::Matrix< QType, backend, RSI, CSI, NZI > &couplings,
 				const grb::Vector< QType, backend > &local_fields,
-				grb::Vector< EnergyType > &energies,
-				grb::Vector< TempType > &betas,
+				grb::Vector< EnergyType, backend > &energies,
+				grb::Vector< TempType, backend > &betas,
 				std::vector< grb::Vector< StateType, backend > >  &temp_states,
-				grb::Vector< EnergyType > &temp_energies,
+				grb::Vector< EnergyType, backend > &temp_energies,
 				SweepDataType& temp_sweep,
 				const size_t &n_sweeps = 1,
 				const bool &use_pt = false
 				){
 
+			const size_t s = spmd<>::pid();
 			const size_t n_replicas = states.size();
 			const size_t n = grb::size(states[0]);
 
@@ -198,6 +263,8 @@ namespace grb {
 				assert( n == grb::size( states[ i ] ) );
 			}
 
+			grb::RC rc = grb::SUCCESS;
+
 
 #ifndef NDEBUG
 			if( grb::spmd<>::pid() == 0 ) {
@@ -210,15 +277,12 @@ namespace grb {
 			}
 #endif
 
-			grb::RC rc = grb::SUCCESS;
-
 			temp_energies = energies;
 			temp_states =  states;
 
 			for( size_t i_sweep = 0 ; rc == grb::SUCCESS && i_sweep < n_sweeps ; ++i_sweep ){
 				for( size_t j = 0 ; j < n_replicas ; ++j ){
 					
-					grb::wait();
 					energies[j] += sweep( couplings, local_fields, states[j], betas[j], temp_sweep );
 					grb::wait();
 				
@@ -228,13 +292,13 @@ namespace grb {
 						temp_states[j] = states[j];
 					}
 				} // n_replicas
-
+				// std::cerr << "Iteration " << i_sweep << " " << rc << std::endl;
 				if( rc == SUCCESS && use_pt ){
 					// do a Parallel Tempering move
-					rc = pt( states, energies, betas );
+					rc = pt< backend >( states, energies, betas );
 				}
 #ifndef NDEBUG
-				if( grb::spmd<>::pid() == 0 ) {
+				if( s == 0 ) {
 					std::cerr << "Energy at iteration " << i_sweep << " = " << energies[ 0 ] << std::endl;
 				}
 #endif
@@ -246,6 +310,7 @@ namespace grb {
 					      << __FILE__ << ": " << grb::toString( rc ) << "\n";
 			}
 #endif
+			// grb::collectives<>::reduce(); ?
 			if( rc == SUCCESS ){
 				states = temp_states;
 				energies = temp_energies;
diff --git a/tests/smoke/CMakeLists.txt b/tests/smoke/CMakeLists.txt
index 698413936..95758a7e5 100644
--- a/tests/smoke/CMakeLists.txt
+++ b/tests/smoke/CMakeLists.txt
@@ -143,6 +143,11 @@ add_grb_executables( conjugate_gradient_complex conjugate_gradient.cpp
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
 	COMPILE_DEFINITIONS _CG_COMPLEX
 )
+
+add_grb_executables( simulated_annealing_re_from_mpi simulated_annealing_re_from_mpi.cpp
+	BACKENDS bsp1d
+	ADDITIONAL_LINK_LIBRARIES test_utils_headers
+)
 add_grb_executables( simulated_annealing_re simulated_annealing_re.cpp
 	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index ff45011cf..59d70e710 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -130,17 +130,16 @@ namespace test_data {
 		// {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, {15}
 	};
 
-    std::minstd_rand global_rng ( 8 ); // or std::mt19937
 }
 // --- New, minimal runner configuration and result types ---
 struct input {
     bool use_default_data = false;
     std::string filename_Jmatrix;
     std::string filename_h;
-    size_t n_replicas = 3;
-    size_t nsweeps = 2;
-    bool use_pt = true;
-    unsigned seed = 8;
+    size_t n_replicas = test_data::n_replicas;
+    size_t nsweeps = test_data::nsweeps;
+    bool use_pt = test_data::use_pt;
+    unsigned seed = test_data::seed;
     std::string sweep_name = "sequential_sweep_immediate";
     bool verify = false;
     std::string filename_ref_solution;
@@ -188,9 +187,11 @@ void read_matrix_data(const std::string &filename, std::vector<Dtype> &data, boo
 		) {
 			data.push_back( Dtype( *it ) );
 #ifdef DEBUG_IMSB
-			// print last data element from std::vector<NonzeroT> data
-			std::cout << "readmatrix_data: " << data.back().first.first << ", "
-				<< data.back().first.second << ", " << data.back().second << "\n";
+			if( spmd<>::pid() == 0 ){
+				// print last data element from std::vector<NonzeroT> data
+				std::cout << "readmatrix_data: " << data.back().first.first << ", "
+					<< data.back().first.second << ", " << data.back().second << "\n";
+			}
 #endif
 		}
 	} catch( std::exception &e ) {
@@ -211,9 +212,11 @@ void read_matrix_data_from_array(
                 NonzeroT( entry.first.first, entry.first.second, entry.second )
             );
 #ifdef DEBUG_IMSB
-            // print last data element from std::vector<NonzeroT> data
-            std::cout << "read_matrix_data_from_array: " << data.back().first.first << ", "
-                << data.back().first.second << ", " << data.back().second << "\n";
+			if( spmd<>::pid() < 2 ){
+				// print last data element from std::vector<NonzeroT> data
+				std::cout << "read_matrix_data_from_array: " << data.back().first.first << ", "
+					<< data.back().first.second << ", " << data.back().second << "\n";
+			}
 #endif
         }
         std::get<0>(Storage::getData()) = test_data::n;
@@ -282,7 +285,7 @@ EnergyType get_energy(
 	rc = rc ? rc : grb::mxv( tmp, couplings, state, ring );
 	rc = rc ? rc : grb::foldl( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() );
 	rc = rc ? rc : grb::foldl( tmp, local_fields, ring.getAdditiveMonoid() );
-	rc = rc ? rc : grb::dot( energy, tmp, state, ring );
+	rc = rc ? rc : grb::dot<>( energy, tmp, state, ring );
 	assert( rc == grb::SUCCESS );
 
 	return energy;
@@ -297,6 +300,7 @@ template<
 					 grb::Vector< EnergyType >&,
 					 grb::Vector< bool >&
 					 >,
+		grb::Descriptor descr = grb::descriptors::no_operation,
 		class Ring = Semiring<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one
@@ -313,7 +317,8 @@ static EnergyType sequential_sweep_immediate(
 					 grb::Vector< IOType >&,
 					 const std::vector< grb::Vector< bool > >&,
 					 grb::Vector< EnergyType >&,
-					 grb::Vector< bool >&
+					 grb::Vector< bool >&,
+					 std::minstd_rand&
 					 > &data
 			  ){
 		const Ring ring = Ring();
@@ -329,6 +334,7 @@ static EnergyType sequential_sweep_immediate(
 		const auto &masks = std::get<3>(data);
 		auto &dn		= std::get<4>(data);
 		auto &accept	= std::get<5>(data);
+		auto &rng       = std::get<6>(data);
 
 		rc = rc ? rc : grb::wait();
 		rc = rc ? rc : grb::resize( h, n );
@@ -342,7 +348,7 @@ static EnergyType sequential_sweep_immediate(
 
 		std::uniform_real_distribution< JType > rand ( 0.0, 1.0 );
 		for( size_t j = 0 ; j < n ; ++j ){
-			const auto rnd = rand( test_data::global_rng );
+			const auto rnd = rand( rng );
 			rc = rc ? rc : grb::setElement(log_rand,  std::log( rnd ), j );
 		}
 		// rc = rc ? rc : grb::wait();
@@ -389,7 +395,7 @@ static EnergyType sequential_sweep_immediate(
 			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( -1 ), ring.getAdditiveMonoid() );
 			
 			// Update delta_energy -= dot(dn, accept)
-			rc = rc ? rc : grb::dot( delta_energy, delta, h, ring );
+			rc = rc ? rc : grb::dot< descr >( delta_energy, delta, h, ring );
 			// rc = rc ? rc : grb::wait();
 
 			// update h
@@ -430,7 +436,8 @@ template<
 					 grb::Vector< IOType >&,
 					 const std::vector< grb::Vector< bool > >&,
 					 grb::Vector< EnergyType >&,
-					 grb::Vector< bool >&
+					 grb::Vector< bool >&,
+					 std::minstd_rand&
 					 >,
 		typename SweepFuncType = std::function< EnergyType(
 					 const grb::Matrix< JType >&,
@@ -455,44 +462,52 @@ void ioProgram( const struct input &data_in, bool &success ) {
 
     using namespace test_data;
 	success = false;
-	// Parse and store matrix in singleton class
-    // Map Storage tuple fields to meaningful names and wire up default data
-    auto &storage = Storage::getData();
-    // auto &n           = std::get<0>(storage); // n (rows/cols)
-    // auto &nnz         = std::get<1>(storage); // nz (nonzeros)
-    auto &nsweeps_st  = std::get<2>(storage); // nsweeps
-    auto &n_replicas_st = std::get<3>(storage); // n_replicas
-    auto &use_pt      = std::get<4>(storage); // use_pt
-    auto &seed_st     = std::get<5>(storage); // seed
-    auto &sweep_name  = std::get<6>(storage); // sweep_name
-    auto &Jdata       = std::get<7>(storage); // std::vector<NonzeroT>
-    auto &h           = std::get<8>(storage); // std::vector<JType>
-
-    // Initialize metadata from input (allow CLI to override defaults)
-    nsweeps_st    = data_in.nsweeps;
-    n_replicas_st = data_in.n_replicas;
-    use_pt        = data_in.use_pt;
-    seed_st       = data_in.seed;
-    sweep_name    = data_in.sweep_name;
-
-    if ( data_in.use_default_data ) {
-        // if no file provided, use default data from file_content
-        read_matrix_data_from_array<NonzeroT>( test_data::j_matrix_data, Jdata );
-        read_vector_data_from_array<JType>( test_data::h_array_data, h );
-        // other data
-    } else {
-        // read from files if provided
-        read_matrix_data<NonzeroT>( data_in.filename_Jmatrix, Jdata, data_in.direct );
-        read_vector_data<JType>( data_in.filename_h, h );
-		if(data_in.verify) {
-			if(data_in.filename_ref_solution.empty()) {
-				std::cerr << "Reference solution file not provided for verification\n";
-				return;
+
+	const size_t s = spmd<>::pid();
+	assert( s < spmd<>::nprocs() );
+
+	try {
+		// Parse and store matrix in singleton class
+		// Map Storage tuple fields to meaningful names and wire up default data
+		auto &storage = Storage::getData();
+		// auto &n           = std::get<0>(storage); // n (rows/cols)
+		// auto &nnz         = std::get<1>(storage); // nz (nonzeros)
+		auto &nsweeps_st  = std::get<2>(storage); // nsweeps
+		auto &n_replicas_st = std::get<3>(storage); // n_replicas
+		auto &use_pt      = std::get<4>(storage); // use_pt
+		auto &seed_st     = std::get<5>(storage); // seed
+		auto &sweep_name  = std::get<6>(storage); // sweep_name
+		auto &Jdata       = std::get<7>(storage); // std::vector<NonzeroT>
+		auto &h           = std::get<8>(storage); // std::vector<JType>
+
+		// Initialize metadata from input (allow CLI to override defaults)
+		nsweeps_st    = data_in.nsweeps;
+		n_replicas_st = data_in.n_replicas;
+		use_pt        = data_in.use_pt;
+		seed_st       = data_in.seed;
+		sweep_name    = data_in.sweep_name; // TODO: makes bsp1d backend crash!?
+
+
+		if ( data_in.use_default_data ) {
+			// if no file provided, use default data from file_content
+			read_matrix_data_from_array<NonzeroT>( test_data::j_matrix_data, Jdata );
+			read_vector_data_from_array<JType>( test_data::h_array_data, h );
+			// other data
+		} else {
+			// read from files if provided
+			read_matrix_data<NonzeroT>( data_in.filename_Jmatrix, Jdata, data_in.direct );
+			read_vector_data<JType>( data_in.filename_h, h );
+			if(data_in.verify) {
+				if(data_in.filename_ref_solution.empty()) {
+					std::cerr << "Reference solution file not provided for verification\n";
+					return;
+				}
 			}
 		}
-		//read_vector_data<JType>( data_in.filename_ref_solution, sol );
-
-    }
+	} catch( std::exception &e ) {
+		std::cerr << "I/O program failed: " << e.what() << "\n";
+		return;
+	}
 
 	success = true;
 }
@@ -504,11 +519,12 @@ void grbProgram(
 ) {
     std::cout<< "grbProgram: running simulated-annealing RE solver (stub)\n";
 
-
 	// get user process ID
 	const size_t s = spmd<>::pid();
 	assert( s < spmd<>::nprocs() );
 
+	// std::cerr << "Process " << s <<  " running at line " << __LINE__ << std::endl;
+
     grb::utils::Timer timer;
 	timer.reset();
 
@@ -518,6 +534,7 @@ void grbProgram(
 		std::cout << "problem size n = " << n << "\n";
 	}
     grb::Vector< JType > h( n );
+
     // populate J with test (random) values
     grb::RC rc = grb::SUCCESS;
 
@@ -555,10 +572,10 @@ void grbProgram(
 		}
 
 #ifdef DEBUG_IMSB
-	if( s == 0 && grb::ncols( J ) < 40 ) {
-		std::cout << "Matrix J:\n";
-		print_matrix( J );
-	}
+		if( s == 0 && grb::ncols( J ) < 40 ) {
+			std::cout << "Matrix J:\n";
+			print_matrix( J );
+		}
 #endif
 	}
 
@@ -587,6 +604,10 @@ void grbProgram(
 		}
 	}
 
+    // seed RNGs (C and C++ engines) using requested seed (hardcoded default 8 if not provided)
+    std::srand( static_cast<unsigned>( data_in.seed + s ) );
+    std::minstd_rand rng ( data_in.seed + s ); // rng or std::mt19937
+
     // create states storage and initialize with random 1/0 values
     const size_t n_replicas = std::get<3>(Storage::getData());
     std::vector< grb::Vector<IOType> > states;
@@ -598,7 +619,7 @@ void grbProgram(
         std::vector< IOType > rand_data;
         for ( size_t i = 0; i < n; ++i ) {
             rand_data.emplace_back( static_cast<IOType>(
-                randint( test_data::global_rng ) ) );
+                randint( rng ) ) );
         }
         rc = rc ? rc : grb::buildVector(
             states.back(),
@@ -648,7 +669,8 @@ void grbProgram(
 			temp_delta,
 			(const typeof(masks)&) masks,
 			temp_dn,
-			temp_accept
+			temp_accept,
+			rng
 			);
 	grb::wait();
 
@@ -848,10 +870,6 @@ int main( int argc, char ** argv ) {
         return 1;
     }
 
-    // seed RNGs (C and C++ engines) using requested seed (hardcoded default 8 if not provided)
-	in.seed += spmd<>::pid();
-    std::srand( static_cast<unsigned>( in.seed ) );
-	test_data::global_rng.seed(in.seed);
 
     std::cout << "seed=" << in.seed << " n_replicas=" << in.n_replicas << " nsweeps=" << in.nsweeps << " sweep=" << in.sweep_name << "\n";
 
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
new file mode 100644
index 000000000..2fd397f33
--- /dev/null
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -0,0 +1,918 @@
+/*
+  Minimal scaffold adapted from ising_machine_sb.cpp to drive a replica-exchange
+  simulated-annealing (RE-SA) solver.  Algorithmic parts are intentionally left
+  unimplemented (stubs).  This file mirrors the existing IO / launcher /
+  program structure and replaces numpy arrays with grb::Vector and lists of
+  numpy vectors with std::vector< grb::Vector<...> >. Sparse matrices are
+  represented as grb::Matrix< JType >.
+
+  Purpose: allow running internal tests or an external-run mode while the RE-SA
+  algorithm is implemented separately.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <tuple>
+#include <string>
+#include <memory>
+#include <algorithm>
+#include <random>
+#include <cassert>
+#include <cstdlib>
+#include <unistd.h>
+
+#include <mpi.h>
+
+#include <graphblas/algorithms/simulated_annealing_re.hpp>
+#include <graphblas/nonzeroStorage.hpp>
+#include <graphblas/utils/timer.hpp>
+#include <graphblas/utils/parser.hpp>
+#include <graphblas/utils/singleton.hpp>
+#include <graphblas/utils/iterators/nonzeroIterator.hpp>
+#include <utils/output_verification.hpp>
+#include <graphblas.hpp>
+#include <utils/print_vec_mat.hpp>
+
+const int LPF_MPI_AUTO_INITIALIZE = 0; // NOTE(review): presumably read by LPF's MPI engine, with 0 meaning this program initialises MPI itself -- confirm
+
+using namespace grb;
+
+#define DEBUG_IMSB 1
+// ISCLOSE(a,b): true when b is within 1e-4 of a, either relative to |a| or absolutely; fully parenthesised so !ISCLOSE(...) negates the whole test.
+#define ISCLOSE(a,b) ( (std::abs((b)-(a))/std::abs(a) < 1e-4) || (std::abs((b)-(a)) < 1e-4) )
+
+// Types
+using IOType = double;   // scalar/vector element type
+using JType  = double;   // coupling (matrix) value type
+using EnergyType  = double;   // energy value type (return type of get_energy and of the sweep)
+
+/** Matrix-file reader for JType values, indexed by the wider of the row and column index types */
+using Parser = grb::utils::MatrixFileReader<
+	JType,
+	std::conditional<
+		(sizeof(grb::config::RowIndexType) > sizeof(grb::config::ColIndexType)),
+		grb::config::RowIndexType,
+		grb::config::ColIndexType
+	>::type
+>;
+
+/** Storage for one nonzero: row index type, column index type, JType value */
+using NonzeroT = internal::NonzeroStorage<
+	grb::config::RowIndexType,
+	grb::config::ColIndexType,
+	JType
+>;
+
+/** Singleton-wrapped tuple holding the problem data and run settings; slots are read via std::get<k> */
+using Storage = grb::utils::Singleton<
+    std::tuple<
+        size_t,                    // [0] n (rows/columns)
+        size_t,                    // [1] nz (nonzeros)
+        size_t,                    // [2] nsweeps
+        size_t,                    // [3] n_replicas
+        bool,                      // [4] use_pt
+        unsigned,                  // [5] seed
+        std::string,               // [6] sweep_name
+        std::vector<NonzeroT>,     // [7] matrix data
+        std::vector<JType>         // [8] h vector
+    >
+>;
+
+namespace test_data {
+    constexpr size_t n = 16; // problem size; written to Storage slot 0 by read_matrix_data_from_array
+    constexpr size_t nsweeps = 2; // default for input::nsweeps
+    constexpr size_t n_replicas = 3; // default for input::n_replicas
+    constexpr bool use_pt = true; // default for input::use_pt
+    constexpr unsigned seed = 8; // default for input::seed
+    // Built-in coupling entries as ((row, column), value); every entry is listed together with its transpose.
+    const std::vector< std::pair< std::pair< grb::config::RowIndexType, grb::config::ColIndexType >, JType > > j_matrix_data = {
+		{{0, 1}, -0.2752300610319546},
+		{{1, 0}, -0.2752300610319546},
+		{{1, 2}, -0.10636508505639508},
+		{{2, 1}, -0.10636508505639508},
+		{{2, 3}, 0.3961450048806352},
+		{{3, 2}, 0.3961450048806352},
+		{{3, 4}, -0.15453838800213293},
+		{{3, 5}, 0.4847494372852713},
+		{{4, 3}, -0.15453838800213293},
+		{{4, 5}, -0.4712679510367046},
+		{{5, 3}, 0.4847494372852713},
+		{{5, 4}, -0.4712679510367046},
+		{{5, 6}, -0.1483152637298799},
+		{{6, 5}, -0.1483152637298799},
+		{{7, 8}, -0.11904111079614699},
+		{{8, 7}, -0.11904111079614699},
+		{{9, 10}, -0.18031020353297234},
+		{{10, 9}, -0.18031020353297234},
+		{{10, 11}, -0.22985425840853468},
+		{{11, 10}, -0.22985425840853468},
+		{{11, 12}, 0.30105588632639446},
+		{{11, 13}, 0.13823880612312134},
+		{{12, 11}, 0.30105588632639446},
+		{{13, 11}, 0.13823880612312134},
+		{{13, 14}, 0.10364447636911123},
+		{{14, 13}, 0.10364447636911123},
+		{{14, 15}, 0.2955745584289766},
+		{{15, 14}, 0.2955745584289766},
+    };
+
+    // number of entries listed in j_matrix_data
+    const size_t nnz = j_matrix_data.size();
+    // Built-in local-field values: 16 entries, matching n.
+    const std::vector< JType > h_array_data = {
+        -0.08910436,  0.58034508,  0.97719304,  0.16792909,
+		-0.9221754 , -0.10715418, -0.62365497,  0.25411129,
+		-0.5693644 , -0.69805978,  0.07228861, -0.79922641,
+		0.46231686 , 0.87930208 ,  0.88663637, -0.25052299,
+    };
+	// Partition of indices 0..15 into three blocks; no coupling listed above joins two indices of one block. Presumably the source of the sweep masks -- confirm.
+	const std::vector< std::vector< size_t > > row_blocks = {
+		// {3, 1, 6, 7, 9, 11, 12, 13, 14, 15}, {5, 2, 0, 8, 10}, {4} // for python data files
+		{0, 2, 4, 7, 9, 12, 13, 15}, {1, 3, 6, 8, 11}, {5, 10, 14},
+		// {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, {15}
+	};
+
+}
+// --- New, minimal runner configuration and result types ---
+struct input {
+    bool use_default_data = false;     // selects the built-in test_data arrays instead of input files
+    std::string filename_Jmatrix;      // coupling-matrix file, passed to read_matrix_data
+    std::string filename_h;            // local-field vector file, passed to read_vector_data
+    size_t n_replicas = test_data::n_replicas;
+    size_t nsweeps = test_data::nsweeps;
+    bool use_pt = test_data::use_pt;
+    unsigned seed = test_data::seed;
+    std::string sweep_name = "sequential_sweep_immediate";
+    bool verify = false;               // verification needs a non-empty filename_ref_solution
+    std::string filename_ref_solution;
+    bool direct = false;               // forwarded to the matrix parser; initialised so it is never read indeterminate
+    size_t rep = 0;                    // NOTE(review): presumably inner repetition count -- confirm
+    size_t outer = 1;                  // NOTE(review): presumably outer repetition count -- confirm
+};
+
+struct output {
+    int error_code = 0;
+    // TODO: remove iterations if not applicable
+    size_t iterations = 10; // total number of iterations performed does not make sense since the code does not have convergence criteria
+    EnergyType best_energy = std::numeric_limits< EnergyType >::max(); // starts at the largest representable energy
+    size_t rep = 0; // zero-initialised so the field is never read indeterminate
+	grb::utils::TimerResults times;
+    std::unique_ptr< PinnedVector< JType, grb::reference > > pinnedSolutionVector;
+    std::unique_ptr< PinnedVector< JType, grb::reference > > pinnedRefSolutionVector;
+    // other things like eg: best replicas ...
+};
+
+/**
+ * Reads the matrix file \a filename and appends every parsed nonzero to \a data.
+ * The dimension and nonzero count are written to slots 0 and 1 of Storage.
+ */
+template< typename Dtype >
+void read_matrix_data(const std::string &filename, std::vector<Dtype> &data, bool direct) {
+	// Errors surfacing as std::exception are printed to stderr and then swallowed.
+	try {
+		Parser matrix_file( filename, direct );
+		// the matrix must be square (checked in debug builds only)
+		assert( matrix_file.m() == matrix_file.n() );
+		auto &store = Storage::getData();
+		std::get<0>( store ) = matrix_file.n();
+		try {
+			std::get<1>( store ) = matrix_file.nz();
+		} catch( ... ) {
+			// nz() threw: fall back to the entry count reported by the parser
+			std::get<1>( store ) = matrix_file.entries();
+		}
+		// The file is walked with SEQUENTIAL iterators; a PARALLEL walk is to be
+		// re-enabled once internal issue #342 is resolved.
+		auto nz_it = matrix_file.begin( SEQUENTIAL );
+		while( nz_it != matrix_file.end( SEQUENTIAL ) ) {
+			data.push_back( Dtype( *nz_it ) );
+#ifdef DEBUG_IMSB
+			if( spmd<>::pid() == 0 ){
+				// echo the nonzero that was just appended
+				const Dtype &added = data.back();
+				std::cout << "readmatrix_data: " << added.first.first << ", "
+					<< added.first.second << ", " << added.second << "\n";
+			}
+#endif
+			++nz_it;
+		}
+	} catch( const std::exception &e ) {
+		std::cerr << "I/O program failed: " << e.what() << "\n";
+		return;
+	}
+}
+
+template< typename NonzeroT, typename IType, typename VType >
+void read_matrix_data_from_array(
+	const std::vector<std::pair< std::pair< IType, IType >, VType > > &array,
+	std::vector<NonzeroT> &data
+) {
+	// Turns each ((row, column), value) entry of `array` into a NonzeroT appended to
+	// `data`, then stores test_data::n and data.size() in Storage slots 0 and 1.
+	try {
+		for( size_t k = 0; k < array.size(); ++k ) {
+			const auto &coords = array[ k ].first;
+			data.emplace_back( NonzeroT( coords.first, coords.second, array[ k ].second ) );
+#ifdef DEBUG_IMSB
+			if( spmd<>::pid() < 2 ){
+				const NonzeroT &added = data.back();
+				std::cout << "read_matrix_data_from_array: " << added.first.first << ", "
+					<< added.first.second << ", " << added.second << "\n";
+			}
+#endif
+		}
+		std::get<0>( Storage::getData() ) = test_data::n;
+		std::get<1>( Storage::getData() ) = data.size();
+	} catch( const std::exception &e ) {
+		std::cerr << "Failed to read matrix data from array: " << e.what() << "\n";
+		return;
+	}
+}
+
+template< typename Dtype >
+void read_vector_data(const std::string &filename, std::vector<Dtype> &data) {
+	// Appends one parsed value per non-empty line of `filename`; failures go to stderr.
+	try {
+		std::ifstream in( filename );
+		if( !in.is_open() ) {
+			std::cerr << "Failed to open vector file: " << filename << "\n";
+			return;
+		}
+		for( std::string text; std::getline( in, text ); ) {
+			if( !text.empty() ) {
+				std::istringstream fields( text );
+				Dtype parsed;
+				if( !(fields >> parsed) ) {
+					throw std::runtime_error( "Failed to parse line in vector file" );
+				}
+				data.push_back( parsed );
+			}
+		}
+	} catch( const std::exception &e ) {
+		std::cerr << "I/O program failed: " << e.what() << "\n";
+		return;
+	}
+}
+
+template< typename Dtype >
+void read_vector_data_from_array(
+	const std::vector<Dtype> &array, std::vector<Dtype> &data
+) {
+	// Appends a copy of every element of `array`, in order, to the back of `data`.
+	try {
+		for( const Dtype &value : array ) {
+			data.push_back( value );
+		}
+	} catch( const std::exception &e ) {
+		std::cerr << "Failed to read vector data from array: " << e.what() << "\n";
+		return;
+	}
+}
+
+// Returns dot( 0.5 * (couplings * state) + local_fields, state ) over `ring` (dense reading of the steps below).
+template<
+	Backend backend,
+	class Ring = Semiring<
+		grb::operators::add< JType >, grb::operators::mul< JType >,
+		grb::identities::zero, grb::identities::one
+	> >
+EnergyType get_energy(
+				 const grb::Matrix< JType, backend >& couplings,
+				 const grb::Vector< JType, backend > &local_fields,
+				 const grb::Vector< IOType, backend > &state,
+				 const Ring &ring = Ring()
+			  ){
+	// Fresh workspace per call: a function-local static would keep the size of the first call and live until program exit.
+	grb::Vector< JType, backend > tmp ( grb::size( local_fields ) );
+	grb::RC rc = grb::clear( tmp );
+	EnergyType energy = 0.0;
+	// tmp receives couplings * state, is scaled by 0.5, then has local_fields folded in
+	rc = rc ? rc : grb::mxv( tmp, couplings, state, ring );
+	rc = rc ? rc : grb::foldl( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() );
+	rc = rc ? rc : grb::foldl( tmp, local_fields, ring.getAdditiveMonoid() );
+	rc = rc ? rc : grb::dot<>( energy, tmp, state, ring );
+	assert( rc == grb::SUCCESS ); // a failed step is only detected when assertions are enabled
+	return energy;
+}
+
+template<
+		class Ring = Semiring<
+			grb::operators::add< JType >, grb::operators::mul< JType >,
+			grb::identities::zero, grb::identities::one
+		>,
+		Backend backend = grb::reference,
+		typename SweepDataType = std::tuple< // NOTE(review): this default has 6 elements, while the `data` parameter below is a 7-element tuple (extra std::minstd_rand&)
+					 grb::Vector< JType, backend >&,
+					 grb::Vector< JType, backend >&,
+					 grb::Vector< IOType, backend >&,
+					 const std::vector< grb::Vector< bool, backend > >&,
+					 grb::Vector< EnergyType, backend >&,
+					 grb::Vector< bool, backend >&
+					 >,
+		grb::Descriptor descr = grb::descriptors::no_operation
+	>
+EnergyType sequential_sweep_immediate(
+				 const grb::Matrix< JType, backend >& couplings,
+				 const grb::Vector< JType, backend > &local_fields,
+				 grb::Vector< IOType, backend > &state,
+				 const JType &beta,
+				 std::tuple<
+					 grb::Vector< JType, backend >&,
+					 grb::Vector< JType, backend >&,
+					 grb::Vector< IOType, backend >&,
+					 const std::vector< grb::Vector< bool, backend > >&,
+					 grb::Vector< EnergyType, backend >&,
+					 grb::Vector< bool, backend >&,
+					 std::minstd_rand&
+					 > &data
+			  ){
+		const size_t s = spmd<>::pid();
+		// s is only read by the debug-only consistency check at the end of this function
+		const Ring ring = Ring();
+		// One sweep over `state`: for each mask in std::get<3>(data) the selected entries are flipped
+		// (x -> 1 - x) where the acceptance test holds. Returns the accumulated dot(delta, h).
+		grb::RC rc = grb::SUCCESS;
+		const size_t n = grb::size( state );
+		EnergyType delta_energy = static_cast< JType >(0.0);
+		// unpack the caller-provided workspace vectors, the masks and the random engine
+		auto &h 		= std::get<0>(data);
+		auto &log_rand	= std::get<1>(data);
+		auto &delta		= std::get<2>(data);
+		const auto &masks = std::get<3>(data);
+		auto &dn		= std::get<4>(data);
+		auto &accept	= std::get<5>(data);
+		auto &rng       = std::get<6>(data);
+
+		rc = rc ? rc : grb::wait();
+		rc = rc ? rc : grb::resize( h, n );
+		rc = rc ? rc : grb::resize( log_rand, n );
+		rc = rc ? rc : grb::resize( delta, n );
+		rc = rc ? rc : grb::resize( dn, n );
+		rc = rc ? rc : grb::resize( accept, n );
+		// h starts as local_fields; mxv then adds couplings * state (assuming mxv accumulates into its output -- confirm)
+		rc = rc ? rc : grb::set( h, local_fields );
+		rc = rc ? rc : grb::mxv( h, couplings, state , ring );
+		// log_rand[j] = log(u_j), with u_j drawn from a uniform real distribution on (0.0, 1.0) using rng
+		std::uniform_real_distribution< JType > rand ( 0.0, 1.0 );
+		for( size_t j = 0 ; j < n ; ++j ){
+			const auto rnd = rand( rng );
+			rc = rc ? rc : grb::setElement(log_rand,  std::log( rnd ), j );
+		}
+		// rc = rc ? rc : grb::wait();
+		// print_vector( log_rand, 30, "log_rand" );
+
+#ifndef NDEBUG
+		const grb::Vector< IOType, backend > old_state = state; // debug-only copy for the energy cross-check below
+#endif
+		for(const auto &mask : masks ){
+
+			rc = rc ? rc : grb::clear( accept  );
+			rc = rc ? rc : grb::clear( delta  );
+			rc = rc ? rc : grb::clear( dn );
+
+			// dn = (2*state_slice - 1) * h_slice
+			rc = rc ? rc : grb::set( dn, mask, state );
+			rc = rc ? rc : grb::foldl( dn, static_cast< EnergyType >( 2 ), ring.getMultiplicativeMonoid()  );
+			rc = rc ? rc : grb::foldl( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
+			rc = rc ? rc : grb::foldl( dn, h, ring.getMultiplicativeMonoid() );
+
+			// ( dn >= 0 ) | ( log_rand < beta * dn )
+			rc = rc ? rc : grb::set( accept, mask );
+			rc = rc ? rc : grb::wait(); // ERROR: Segmentation Fault with nonblocking backend
+			rc = rc ? rc : grb::eWiseLambda<>(
+					[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
+						(void) i;
+						if( mask[i] ){
+							accept[i] = ( dn[i] >= 0 ) || ( log_rand[i] < beta * dn[i] );
+						}
+					}, mask, log_rand, dn, accept );
+
+			// new_state = np.where(accept, 1 - old, old)
+			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeMonoid() );
+			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( 1 ), ring.getAdditiveMonoid() );
+			
+			// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
+			rc = rc ? rc : grb::clear( delta  );
+			rc = rc ? rc : grb::set( delta, accept, state );
+			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( 2 ), ring.getMultiplicativeMonoid() );
+			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( -1 ), ring.getAdditiveMonoid() );
+			
+			// accumulate dot(delta, h) into delta_energy, using h as it stands before this block's update
+			rc = rc ? rc : grb::dot< descr >( delta_energy, delta, h, ring );
+
+			// update h with couplings * delta so the next mask sees the flipped entries
+			rc = rc ? rc : grb::mxv( h, couplings, delta, ring );
+			
+		}
+		rc = rc ? rc : grb::wait();
+
+#ifndef NDEBUG
+		if( rc != grb::SUCCESS ){
+			std::cerr << "\n\t Error in some GraphBLAS function " << rc << " : " << grb::toString( rc ) << std::endl;
+			abort();
+		}
+		assert( rc == grb::SUCCESS );
+		if(s == 0){
+			const auto new_state = state;
+			// cross-check: the tracked delta must match the difference of two full energy evaluations
+			const auto real_delta = get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state);
+			std::cerr << "\n\t Delta_energy: " << delta_energy;
+			std::cerr << "\n\t Real delta: " << real_delta;
+			std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy;
+			// std::cerr << "\n\t Old energy: " << get_energy(couplings, local_fields, old_state) ;
+			// std::cerr << "\n\t New energy: " << get_energy(couplings, local_fields, new_state);
+			std::cerr << std::endl;
+
+			assert( ISCLOSE(real_delta, delta_energy ) );
+			// TODO: assert fails with nonblocking backend -> see issue #397
+		}
+#endif
+
+		return delta_energy;
+}
+
+
+/**
+ * Maps a sweep name onto the sweep routine to use.
+ *
+ * Only one routine is available: any name other than
+ * "sequential_sweep_immediate" triggers a warning on stderr, after which the
+ * same routine is returned regardless.
+ *
+ * @tparam backend       Backend of the containers the sweep operates on.
+ * @tparam SweepDataType Tuple of references handed through to the sweep.
+ * @tparam SweepFuncType Callable type returned to the caller.
+ * @tparam Ring          Semiring the sweep is instantiated with.
+ *
+ * @param[in] sweep_name Name of the requested sweep.
+ *
+ * @returns sequential_sweep_immediate, instantiated with \a Ring.
+ */
+template<
+	Backend backend,
+	typename SweepDataType = std::tuple<
+		grb::Vector< JType, backend > &,
+		grb::Vector< JType, backend > &,
+		grb::Vector< IOType, backend > &,
+		const std::vector< grb::Vector< bool, backend > > &,
+		grb::Vector< EnergyType, backend > &,
+		grb::Vector< bool, backend > &,
+		std::minstd_rand &
+	>,
+	typename SweepFuncType = std::function<
+		EnergyType(
+			const grb::Matrix< JType, backend > &,
+			const grb::Vector< JType, backend > &,
+			grb::Vector< IOType, backend > &,
+			const JType &,
+			SweepDataType &
+		)
+	>,
+	class Ring = Semiring<
+		grb::operators::add< JType >,
+		grb::operators::mul< JType >,
+		grb::identities::zero,
+		grb::identities::one
+	>
+>
+SweepFuncType get_sweep_function( const std::string &sweep_name ) {
+	const bool is_known = ( sweep_name == "sequential_sweep_immediate" );
+	if( !is_known ) {
+		std::cerr << "Warning: unknown sweep setting. Falling back to  \"sequential_sweep_immediate\"" << std::endl;
+	}
+	// the only sweep implemented so far
+	return sequential_sweep_immediate< Ring >;
+}
+
+/**
+ * I/O stage: copies the run parameters into the Storage singleton and loads
+ * the coupling-matrix nonzeroes and the local-field vector into it.
+ *
+ * The problem data is taken either from the embedded test_data arrays or from
+ * the files named in \a data_in.
+ *
+ * @param[in]  data_in Run configuration.
+ * @param[out] success Set to true only when all data was loaded. It remains
+ *                     false when an exception is caught, or when verification
+ *                     is requested for file input without a reference-solution
+ *                     file name.
+ */
+void ioProgram( const struct input &data_in, bool &success ) {
+
+	using namespace test_data;
+
+	// pessimistic default: every early exit below reports failure
+	success = false;
+
+	const size_t s = spmd<>::pid();
+	assert( s < spmd<>::nprocs() );
+
+	try {
+		// the singleton is a tuple; slots 0 (n) and 1 (nz) are not written here
+		auto &storage = Storage::getData();
+
+		// run parameters (the CLI values override whatever was stored before)
+		std::get<2>( storage ) = data_in.nsweeps;
+		std::get<3>( storage ) = data_in.n_replicas;
+		std::get<4>( storage ) = data_in.use_pt;
+		std::get<5>( storage ) = data_in.seed;
+		// slot 6 (sweep_name) keeps its stored value
+		// TODO: copying data_in.sweep_name into it makes bsp1d backend crash!?
+
+		auto &couplings_nz = std::get<7>( storage ); // std::vector<NonzeroT>
+		auto &local_fields = std::get<8>( storage ); // std::vector<JType>
+
+		if( !data_in.use_default_data ) {
+			// read from the files that were provided
+			read_matrix_data<NonzeroT>( data_in.filename_Jmatrix, couplings_nz, data_in.direct );
+			read_vector_data<JType>( data_in.filename_h, local_fields );
+			if( data_in.verify && data_in.filename_ref_solution.empty() ) {
+				std::cerr << "Reference solution file not provided for verification\n";
+				return;
+			}
+		} else {
+			// no files requested: fall back to the embedded default data
+			read_matrix_data_from_array<NonzeroT>( test_data::j_matrix_data, couplings_nz );
+			read_vector_data_from_array<JType>( test_data::h_array_data, local_fields );
+		}
+	} catch( const std::exception &e ) {
+		std::cerr << "I/O program failed: " << e.what() << "\n";
+		return;
+	}
+
+	success = true;
+}
+
+
+/**
+ * Main ALP/GraphBLAS program: builds the problem from the Storage singleton,
+ * draws random 0/1 initial states and runs the replica-exchange simulated
+ * annealing solver, either once (cold run) or \a data_in.rep times
+ * (benchmark run).
+ *
+ * @param[in]  data_in Run configuration. A value of zero for \a rep selects
+ *                     the cold run, which also deduces the number of inner
+ *                     repetitions needed for a benchmark of one second or more.
+ * @param[out] out     Receives the error code, the repetition and iteration
+ *                     counts, the timing, and the lowest replica energy.
+ *
+ * Error codes written to \a out.error_code on return:
+ *   -  5: building the coupling matrix failed;
+ *   - 15: building any other part of the problem failed;
+ *   - 30: the solver returned FAILED;
+ *   - 35: any other error occurred while running the solver.
+ * The values 20 and 25 are set on the way during a cold run, but are always
+ * superseded by 30 or 35 before returning.
+ */
+void grbProgram(
+	const struct input &data_in,
+	struct output &out
+) {
+	std::cout<< "grbProgram: running simulated-annealing RE solver (stub)\n";
+
+	// get user process ID
+	const size_t s = spmd<>::pid();
+	assert( s < spmd<>::nprocs() );
+
+	grb::utils::Timer timer;
+	timer.reset();
+
+	/* --- Problem setup --- */
+	const size_t n = std::get<0>(Storage::getData());
+	if( s == 0 ){
+		std::cout << "problem size n = " << n << "\n";
+	}
+	grb::Vector< JType, grb::reference > h( n );
+
+	// holds the first error encountered while setting up the problem
+	grb::RC rc = grb::SUCCESS;
+
+	// load the coupling matrix into GraphBLAS
+	grb::Matrix< JType, grb::reference > J( n, n );
+	{
+		const auto &data = std::get<7>(Storage::getData());
+		// Once internal issue #342 is resolved, PARALLEL can replace SEQUENTIAL here
+		RC io_rc = buildMatrixUnique(
+			J,
+			utils::makeNonzeroIterator<
+				grb::config::RowIndexType, grb::config::ColIndexType, JType
+			>( data.cbegin() ),
+			utils::makeNonzeroIterator<
+				grb::config::RowIndexType, grb::config::ColIndexType, JType
+			>( data.cend() ),
+			SEQUENTIAL
+		);
+		io_rc = io_rc ? io_rc : wait();
+		if( io_rc != SUCCESS ) {
+			std::cerr << "Failure: call to buildMatrixUnique did not succeed "
+				<< "(" << toString( io_rc ) << ")." << std::endl;
+			out.error_code = 5;
+			return;
+		}
+
+#ifdef DEBUG_IMSB
+		if( s == 0 && grb::ncols( J ) < 40 ) {
+			std::cout << "Matrix J:\n";
+			print_matrix( J );
+		}
+#endif
+	}
+
+	// build vector h with data from singleton
+	{
+		const auto &h_data = std::get<8>(Storage::getData());
+		rc = rc ? rc : buildVector(
+			h,
+			h_data.cbegin(),
+			h_data.cend(),
+			SEQUENTIAL
+		);
+	}
+
+	// build masks from row block indices
+	std::vector< grb::Vector< bool, grb::reference > > masks;
+	for( const auto &v : test_data::row_blocks ) {
+		masks.emplace_back( grb::Vector< bool, grb::reference >( n ) );
+		for( const auto &i : v ) {
+			// NOTE(review): the return code is not checked here; the indices
+			// always come from test_data::row_blocks -- confirm that they are
+			// valid for file-based inputs too
+			grb::setElement( masks.back(), 1, i );
+		}
+		if( s == 0 ){
+			print_vector( masks.back(), 30, "MASK" );
+		}
+	}
+
+	// seed RNGs (C and C++ engines) with the requested seed, offset by the
+	// process ID
+	std::srand( static_cast<unsigned>( data_in.seed + s ) );
+	std::minstd_rand rng ( data_in.seed + s ); // rng or std::mt19937
+
+	// create states storage and initialize with random 1/0 values
+	const size_t n_replicas = data_in.n_replicas;
+	std::vector< grb::Vector< IOType, grb::reference > > states;
+	for ( size_t r = 0; r < n_replicas; ++r ) {
+		states.emplace_back( grb::Vector< IOType, grb::reference >(n) );
+		std::uniform_int_distribution< unsigned short > randint(0,1);
+		// draw a dense vector of n random bits, then ingest it in one go
+		std::vector< IOType > rand_data;
+		rand_data.reserve( n );
+		for ( size_t i = 0; i < n; ++i ) {
+			rand_data.emplace_back( static_cast<IOType>(
+				randint( rng ) ) );
+		}
+		rc = rc ? rc : grb::buildVector(
+			states.back(),
+			rand_data.cbegin(),
+			rand_data.cend(),
+			SEQUENTIAL
+		);
+	}
+	using Ring = Semiring<
+			grb::operators::add< JType >, grb::operators::mul< JType >,
+			grb::identities::zero, grb::identities::one >;
+
+	const auto sweep = sequential_sweep_immediate< Ring >; // get_sweep_function( data_in.sweep_name );
+
+	// betas has one entry per replica, each initialised to 10.0; energies holds
+	// the energy of each initial state
+	grb::Vector< JType, grb::reference > betas( n_replicas );
+	grb::Vector< EnergyType, grb::reference > energies( n_replicas );
+	grb::Vector< EnergyType, grb::reference > temp_energies( n_replicas );
+	for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
+		rc = rc ? rc : grb::setElement( betas, static_cast< JType >(10.0), r );
+		rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r] ), r );
+	}
+
+#ifdef DEBUG_IMSB
+	if( s == 0 ) {
+		for ( size_t r = 0; r < n_replicas; ++r ) {
+			std::cout << "Process " << s << ": ";
+			std::cout << "Initial state replica " << r << ":\n";
+			print_vector( states[r], 30 ,"states values" );
+			std::cout << "With energy " << energies[r] << "\n";
+			std::cout << std::endl;
+		}
+
+		// assert( std::abs(get_energy(  J, h, zero ) - 0.5803450826765713) < 1e-4 );
+	}
+#endif
+	rc = rc ? rc : wait();
+
+	// workspace handed through to the sweep function
+	std::vector< grb::Vector< IOType, grb::reference > > temp_states;
+	grb::Vector< JType, grb::reference > temp_h ( n );
+	grb::Vector< JType, grb::reference > temp_log_rand ( n );
+	grb::Vector< EnergyType, grb::reference > temp_dn ( n );
+	grb::Vector< bool, grb::reference > temp_accept ( n );
+	grb::Vector< IOType, grb::reference > temp_delta ( n );
+	auto sweep_data = std::tie(
+			temp_h,
+			temp_log_rand,
+			temp_delta,
+			// the sweep takes the masks read-only; decltype is the standard
+			// spelling of the GNU-only typeof
+			static_cast< const decltype( masks ) & >( masks ),
+			temp_dn,
+			temp_accept,
+			rng
+			);
+	rc = rc ? rc : grb::wait();
+
+	// do not run the solver on a problem that was not built correctly; without
+	// this check the solver's return code would overwrite the setup error
+	if( rc != SUCCESS ) {
+		std::cerr << "Failure: problem setup did not succeed ("
+			<< toString( rc ) << ")." << std::endl;
+		out.error_code = 15;
+		return;
+	}
+
+	out.rep = data_in.rep;
+	// each solver call performs this many sweeps, in both run modes
+	out.iterations = data_in.nsweeps;
+	// time a single call
+	if( out.rep == 0 ) {
+		timer.reset();
+		rc = grb::algorithms::simulated_annealing_RE(
+				sweep, states, J, h, energies, betas, temp_states, temp_energies, sweep_data, data_in.nsweeps, data_in.use_pt
+		);
+
+		rc = rc ? rc : wait();
+		double single_time = timer.time();
+		if( !(rc == SUCCESS || rc == FAILED) ) {
+			std::cerr << "Failure: call to Simulated Annealing RE did not succeed ("
+				<< toString( rc ) << ")." << std::endl;
+			out.error_code = 20;
+		}
+		if( rc == FAILED ) {
+			std::cout << "Warning: call to Simulated Annealing RE did not converge\n";
+		}
+		if( rc == SUCCESS ) {
+			// report the slowest process
+			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
+
+			for(size_t i = 0 ; i < n_replicas ; ++i ){
+				out.best_energy = std::min( out.best_energy, energies[ i ] );
+			}
+		}
+		if( rc != SUCCESS ) {
+			out.error_code = 25;
+		}
+		out.times.useful = single_time;
+		// number of repetitions needed to fill one second (1000 ms)
+		out.rep = static_cast< size_t >( 1000.0 / single_time ) + 1;
+		if( rc == SUCCESS || rc == FAILED ) {
+			if( s == 0 ) {
+				if( rc == FAILED ) {
+					std::cout << "Info: cold Simulated Annealing RE did not converge within ";
+				} else {
+					std::cout << "Info: cold Simulated Annealing RE completed within ";
+				}
+				std::cout << out.iterations << " iterations. "
+					<< "Time taken was " << single_time << " ms. "
+					<< "Deduced inner repetitions parameter of " << out.rep << " "
+					<< "to take 1 second or more per inner benchmark.\n";
+			}
+		}
+	} else {
+		// do benchmark
+		timer.reset();
+		for( size_t i = 0; i < out.rep && rc == SUCCESS; ++i ) {
+			rc = grb::algorithms::simulated_annealing_RE(
+				sweep, states, J, h, energies, betas, temp_states, temp_energies, sweep_data, data_in.nsweeps, data_in.use_pt
+			);
+			if( grb::Properties<>::isNonblockingExecution ) {
+				rc = rc ? rc : wait();
+			}
+		}
+		const double time_taken = timer.time();
+		if( s == 0 ) {
+			for ( size_t r = 0; r < n_replicas; ++r ) {
+				std::cout << "Final state replica " << r << ":\n";
+				print_vector( states[r], 50 ,"states values" );
+				std::cout << "With energy " << energies[ r ] << "\n";
+				std::cout << "With energy " << get_energy(  J, h, states[r] ) << "\n";
+				std::cout << std::endl;
+				// the incrementally tracked energy must match a recomputation
+				assert( ISCLOSE( get_energy( J, h, states[r] ), energies[ r ] ) );
+			}
+		}
+		for(size_t i = 0 ; i < n_replicas ; ++i ){
+			out.best_energy = std::min( out.best_energy, energies[ i ] );
+		}
+
+		out.times.useful = time_taken / static_cast< double >( out.rep );
+		// print timing at root process
+		if( s == 0 ) {
+			std::cout << "Time taken for " << out.rep << " "
+				<< "Simulated Annealing RE calls (hot start): " << out.times.useful << ". "
+				<< "Error code is " << grb::toString( rc ) << std::endl;
+			std::cout << "\tnumber of IM-SB iterations: " << out.iterations << "\n";
+			std::cout << "\tmilliseconds per iteration: "
+				<< ( out.times.useful / static_cast< double >( out.iterations ) )
+				<< "\n";
+		}
+		sleep( 1 );
+	}
+
+	// start postamble
+	timer.reset();
+
+	// set error code
+	if( rc == FAILED ) {
+		out.error_code = 30;
+	} else if( rc != SUCCESS ) {
+		std::cerr << "Benchmark run returned error: " << toString( rc ) << "\n";
+		out.error_code = 35;
+		return;
+	}
+}
+
+
+// --- Simple help / CLI parser for the new runner (no backward compatibility) ---
+/**
+ * Prints the command-line usage text to standard output.
+ *
+ * @param[in] progname Name under which the program was invoked; it is echoed
+ *                     in the usage line.
+ */
+void printhelp( char *progname ) {
+    // the synopsis lists every option that is described below it
+    std::cout << "Usage: " << progname << " [--use-default-data] [--j-matrix-fname STR] [--h-fname STR]\n"
+              << "       [--n-replicas INT] [--nsweeps INT] [--use-pt BOOL] [--seed INT] [--sweep STR]\n"
+              << "       [--verify] [--ref-solution-fname STR] [--help]\n\n"
+              << "Options:\n"
+              << "  --use-default-data         Use embedded default test data\n"
+              << "  --j-matrix-fname STR       Path to J matrix file (matrix-market or supported)\n"
+              << "  --h-fname STR              Path to h (local fields) vector (whitespace separated)\n"
+              << "  --n-replicas INT           Number of replicas (default: 3)\n"
+              << "  --nsweeps INT              Number of sweeps (default: 2)\n"
+              << "  --use-pt BOOL              Use Parallel Tempering (default: 1)\n"
+              << "  --seed INT                 RNG seed (default: 8)\n"
+              << "  --sweep STR                Sweep selector (default: sequential_sweep_immediate)\n"
+              << "  --verify                   Verify output against reference solution\n"
+              << "  --ref-solution-fname STR   Reference solution file (required with --verify unless using default data)\n"
+              << "  --help, -h                 Print this help message\n";
+}
+
+/**
+ * Parses the command line into \a in.
+ *
+ * Before the options are processed the file names are cleared, \a direct is
+ * set to true, \a verify to false, and the inner and outer repetition counts
+ * are taken from the benchmarking configuration. All other fields keep the
+ * value they had on entry unless the matching option is given.
+ *
+ * @param[in,out] in   Configuration to fill in.
+ * @param[in]     argc Number of entries in \a argv.
+ * @param[in]     argv Argument vector; argv[0] is skipped.
+ *
+ * @returns true if the arguments form a usable configuration. Returns false,
+ *          after reporting the reason on stderr, if an option is unknown,
+ *          lacks its value, or has a value that cannot be parsed as a number,
+ *          or if a required file name is missing. Also returns false when help
+ *          is requested; the usage text itself is left to the caller.
+ */
+bool parse_arguments( input &in, int argc, char ** argv ) {
+    in.filename_Jmatrix.clear();
+    in.filename_h.clear();
+    in.filename_ref_solution.clear();
+    in.direct = true;
+    // map benchmarking configuration to the runner's fields
+    in.rep = grb::config::BENCHMARKING::inner();
+    in.outer = grb::config::BENCHMARKING::outer();
+    // keep verify default (false) unless overridden via CLI
+    in.verify = false;
+
+    // declared outside the loop so that the handler below can name the option
+    // whose value could not be parsed
+    std::string a;
+    try {
+        for ( int i = 1; i < argc; ++i ) {
+            a = argv[i];
+
+            // options that consume the next command-line entry as their value
+            const bool needs_value =
+                a == "--j-matrix-fname" || a == "--h-fname" ||
+                a == "--n-replicas" || a == "--nsweeps" ||
+                a == "--use-pt" || a == "--seed" ||
+                a == "--sweep" || a == "--ref-solution-fname";
+            if ( needs_value && i + 1 >= argc ) {
+                std::cerr << a << " requires an argument\n";
+                return false;
+            }
+            const char * const value = needs_value ? argv[++i] : nullptr;
+
+            if ( a == "--use-default-data" ) {
+                in.use_default_data = true;
+            } else if ( a == "--j-matrix-fname" ) {
+                in.filename_Jmatrix = value;
+            } else if ( a == "--h-fname" ) {
+                in.filename_h = value;
+            } else if ( a == "--n-replicas" ) {
+                in.n_replicas = static_cast<size_t>( std::stoul( value ) );
+            } else if ( a == "--nsweeps" ) {
+                in.nsweeps = static_cast<size_t>( std::stoul( value ) );
+            } else if ( a == "--use-pt" ) {
+                in.use_pt = static_cast<bool>( std::stoul( value ) );
+            } else if ( a == "--seed" ) {
+                in.seed = static_cast<unsigned>( std::stoul( value ) );
+            } else if ( a == "--sweep" ) {
+                in.sweep_name = value;
+            } else if ( a == "--verify" ) {
+                in.verify = true;
+            } else if ( a == "--ref-solution-fname" ) {
+                in.filename_ref_solution = value;
+            } else if ( a == "--help" || a == "-h" ) {
+                // main prints the usage text whenever this function returns
+                // false; printing it here as well would show it twice
+                return false;
+            } else {
+                std::cerr << "Unknown argument: " << a << "\n";
+                return false;
+            }
+        }
+    } catch( const std::exception &e ) {
+        // std::stoul throws on non-numeric or out-of-range values; report the
+        // problem instead of letting the exception terminate the program
+        std::cerr << "Invalid value for " << a << ": " << e.what() << "\n";
+        return false;
+    }
+
+    // basic validation
+    if ( !in.use_default_data ) {
+        if ( in.filename_Jmatrix.empty() || in.filename_h.empty() ) {
+            std::cerr << "Either --use-default-data or both --j-matrix-fname and --h-fname must be provided\n";
+            return false;
+        }
+    }
+    if ( in.verify && !in.use_default_data && in.filename_ref_solution.empty() ) {
+        std::cerr << "--ref-solution-fname required when --verify is used without --use-default-data\n";
+        return false;
+    }
+    return true;
+}
+
+// --- Minimal main that uses the existing ioProgram / grbProgram entrypoints ---
+/**
+ * Entry point: initialises MPI, parses the command line, runs the I/O program
+ * followed by the GraphBLAS program, and finalises MPI.
+ *
+ * MPI is finalised on every path that follows a successful MPI_Init, also
+ * when an earlier stage failed.
+ *
+ * @returns the error code reported by grbProgram on success, or otherwise:
+ *   - 10 if MPI_Init failed;
+ *   -  1 if the command line was rejected (or help was requested);
+ *   -  2 if launching the I/O program failed;
+ *   -  3 if the I/O program reported failure;
+ *   -  4 if launching the GraphBLAS program failed;
+ *   - 50 if MPI_Finalize failed and no earlier error occurred.
+ */
+int main( int argc, char ** argv ) {
+    std::cout << "simulated_anealing_re runner\n";
+    input in;
+    output out;
+
+    // init MPI
+    if( MPI_Init( &argc, &argv ) != MPI_SUCCESS ) {
+        std::cerr << "MPI_Init returns with non-SUCCESS exit code." << std::endl;
+        return 10;
+    }
+
+    // first error encountered; stays zero while every stage succeeds
+    int exit_code = 0;
+
+    if ( !parse_arguments( in, argc, argv ) ) {
+        printhelp( argv[0] );
+        exit_code = 1;
+    }
+
+    // Run IO program (populates Storage or similar)
+    if ( exit_code == 0 ) {
+        std::cout << "seed=" << in.seed << " n_replicas=" << in.n_replicas << " nsweeps=" << in.nsweeps << " sweep=" << in.sweep_name << "\n";
+
+        bool success = false;
+        grb::Launcher< FROM_MPI > launcher( MPI_COMM_WORLD );
+        const grb::RC rc = launcher.exec( &ioProgram, in, success, true );
+        if ( rc != SUCCESS ) {
+            std::cerr << "I/O launcher failed: " << toString(rc) << "\n";
+            exit_code = 2;
+        } else if ( !success ) {
+            std::cerr << "I/O program reported failure\n";
+            exit_code = 3;
+        }
+    }
+
+    // Run main GraphBLAS program that builds data and calls reSA stub
+    if ( exit_code == 0 ) {
+        grb::Launcher< FROM_MPI > launcher( MPI_COMM_WORLD );
+        const grb::RC rc = launcher.exec( &grbProgram, in, out, true );
+        if ( rc != SUCCESS ) {
+            std::cerr << "grbProgram launcher failed: " << toString(rc) << "\n";
+            exit_code = 4;
+        }
+    }
+
+    // finalise MPI; both launchers have gone out of scope by now
+    if( MPI_Finalize() != MPI_SUCCESS ) {
+        std::cerr << "MPI_Finalize returns with non-SUCCESS exit code." << std::endl;
+        if ( exit_code == 0 ) {
+            exit_code = 50;
+        }
+    }
+    if ( exit_code != 0 ) {
+        return exit_code;
+    }
+
+    std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.iterations << " best_energy=" << out.best_energy << "\n";
+    return out.error_code;
+}
diff --git a/tests/utils/print_vec_mat.hpp b/tests/utils/print_vec_mat.hpp
index 7a95242ed..1f9770cbf 100644
--- a/tests/utils/print_vec_mat.hpp
+++ b/tests/utils/print_vec_mat.hpp
@@ -90,7 +90,7 @@ void print_vector(
 		os << it->second;
 		(void) ++it;
 	} else if( x_size > 0 ) {
-		os << 0;
+		os << '-';
 	}
 	size_t next_nnz, position;
 	next_nnz = it == end ? limit : it->first;
@@ -100,14 +100,18 @@ void print_vector(
 		// print sequence of zeroes
 		for( ; position < zero_streak; ++position ) {
 			os << ", ";
-			os << 0;
+			os << '-';
 		}
 		if( position < limit ) {
 			os << ", ";
 			os << it->second;
 			(void) ++position;
-			(void) ++it;
-			next_nnz = it->first;
+			(void) ++it; // the entry at it was printed just above, so it was valid
+			if( it != end ){ // only the successor can be the end iterator
+				next_nnz = it->first;
+			}else{
+				next_nnz = limit;
+			}
 		}
 	}
 	os << std::endl << "==============" << std::endl << std::endl;

From 2b8791ca51177bdead45dfdbc42efe975678e2aa Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 7 Nov 2025 09:50:01 +0100
Subject: [PATCH 13/58] Simplified signature of simulated_annealing_RE

---
 .../algorithms/simulated_annealing_re.hpp     | 16 ++--------
 tests/smoke/simulated_annealing_re.cpp        | 32 +++++++++++--------
 .../smoke/simulated_annealing_re_from_mpi.cpp | 30 ++++++++++-------
 3 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index f38c5d3a9..c31fe1b5b 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -210,7 +210,6 @@ namespace grb {
 		 * @param[in]     n_sweeps      Number of Simulated Annealing iterations.
 		 * @param[in]     use_pt		Whether to use Parallel Tampering or not.
 		 *
-		 * @tparam QType		The coupling matrix and the local fields type.
 		 * @tparam StateType	The state variable type.
 		 * @tparam EnergyType	The energy type.
 		 * @tparam TempType		The inverse temperature type.
@@ -219,16 +218,12 @@ namespace grb {
 		 */
 		template<
 			Backend backend,
-			typename QType, // type of coupling matrix values
 			typename StateType, // type of state, possibly 0/1
 			typename EnergyType,
 			typename TempType,
 			typename SweepDataType, // type of data to be passed through to the sweep function
-			typename RSI, typename CSI, typename NZI,
 			typename SweepFuncType = std::function< 
 					EnergyType(
-						 const grb::Matrix< QType, backend, RSI, CSI, NZI >&,
-						 const grb::Vector< QType, backend >&,
 						 grb::Vector< StateType, backend >&,
 						 const TempType&,
 						 SweepDataType&
@@ -237,15 +232,13 @@ namespace grb {
 			>
 		grb::RC simulated_annealing_RE(
 				const SweepFuncType &sweep,
+				SweepDataType& sweep_data,
 				std::vector< grb::Vector< StateType, backend > > &states,
-				const grb::Matrix< QType, backend, RSI, CSI, NZI > &couplings,
-				const grb::Vector< QType, backend > &local_fields,
 				grb::Vector< EnergyType, backend > &energies,
 				grb::Vector< TempType, backend > &betas,
 				std::vector< grb::Vector< StateType, backend > >  &temp_states,
 				grb::Vector< EnergyType, backend > &temp_energies,
-				SweepDataType& temp_sweep,
-				const size_t &n_sweeps = 1,
+				const size_t &n_sweeps,
 				const bool &use_pt = false
 				){
 
@@ -255,9 +248,6 @@ namespace grb {
 
 			assert( n_replicas > 0 );
 			assert( n_replicas == grb::size( betas ) );
-			assert( n == grb::ncols( couplings ) );
-			assert( n == grb::nrows( couplings ) );
-			assert( n == grb::size( local_fields ) );
 
 			for(size_t i = 0; i < n_replicas ; ++i ){
 				assert( n == grb::size( states[ i ] ) );
@@ -283,7 +273,7 @@ namespace grb {
 			for( size_t i_sweep = 0 ; rc == grb::SUCCESS && i_sweep < n_sweeps ; ++i_sweep ){
 				for( size_t j = 0 ; j < n_replicas ; ++j ){
 					
-					energies[j] += sweep( couplings, local_fields, states[j], betas[j], temp_sweep );
+					energies[j] += sweep( states[j], betas[j], sweep_data );
 					grb::wait();
 				
 					// update_best state and energy
diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index 59d70e710..dea6150df 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -293,6 +293,8 @@ EnergyType get_energy(
 
 template<
 		typename SweepDataType = std::tuple<
+				 	 const grb::Matrix< JType >&,
+				 	 const grb::Vector< JType >&,
 					 grb::Vector< JType >&,
 					 grb::Vector< JType >&,
 					 grb::Vector< IOType >&,
@@ -307,11 +309,11 @@ template<
 		>
 	>
 static EnergyType sequential_sweep_immediate(
-				 const grb::Matrix< JType >& couplings,
-				 const grb::Vector< JType > &local_fields,
 				 grb::Vector< IOType > &state,
 				 const JType &beta,
 				 std::tuple<
+				 	 const grb::Matrix< JType > &,
+				 	 const grb::Vector< JType > &,
 					 grb::Vector< JType >&,
 					 grb::Vector< JType >&,
 					 grb::Vector< IOType >&,
@@ -328,13 +330,15 @@ static EnergyType sequential_sweep_immediate(
 		const size_t n = grb::size( state );
 		EnergyType delta_energy = static_cast< JType >(0.0);
 
-		auto &h 		= std::get<0>(data);
-		auto &log_rand	= std::get<1>(data);
-		auto &delta		= std::get<2>(data);
-		const auto &masks = std::get<3>(data);
-		auto &dn		= std::get<4>(data);
-		auto &accept	= std::get<5>(data);
-		auto &rng       = std::get<6>(data);
+		const auto &couplings 	= std::get<0>(data);
+		const auto &local_fields = std::get<1>(data);
+		auto &h 		= std::get<2>(data);
+		auto &log_rand	= std::get<3>(data);
+		auto &delta		= std::get<4>(data);
+		const auto &masks = std::get<5>(data);
+		auto &dn		= std::get<6>(data);
+		auto &accept	= std::get<7>(data);
+		auto &rng       = std::get<8>(data);
 
 		rc = rc ? rc : grb::wait();
 		rc = rc ? rc : grb::resize( h, n );
@@ -431,6 +435,8 @@ static EnergyType sequential_sweep_immediate(
 
 template<
 		typename SweepDataType = std::tuple<
+				 	 const grb::Matrix< JType >&,
+				 	 const grb::Vector< JType >&,
 					 grb::Vector< JType >&,
 					 grb::Vector< JType >&,
 					 grb::Vector< IOType >&,
@@ -440,8 +446,6 @@ template<
 					 std::minstd_rand&
 					 >,
 		typename SweepFuncType = std::function< EnergyType(
-					 const grb::Matrix< JType >&,
-					 const grb::Vector< JType >&,
 					 grb::Vector< IOType >&,
 					 const JType&,
 					 SweepDataType&
@@ -664,6 +668,8 @@ void grbProgram(
 	grb::Vector< bool > temp_accept ( n );
 	grb::Vector< IOType > temp_delta ( n );
 	auto sweep_data = std::tie(
+			(const typeof(J)&) J,
+			(const typeof(h)&) h,
  			temp_h,
 			temp_log_rand,
 			temp_delta,
@@ -680,7 +686,7 @@ void grbProgram(
 	if( out.rep == 0 ) {
 		timer.reset();
 		rc = grb::algorithms::simulated_annealing_RE(
-				sweep, states, J, h, energies, betas, temp_states, temp_energies, sweep_data, data_in.nsweeps, data_in.use_pt
+				sweep, sweep_data, states, energies, betas, temp_states, temp_energies, data_in.nsweeps, data_in.use_pt
         );
 
 		rc = rc ? rc : wait();
@@ -726,7 +732,7 @@ void grbProgram(
 				out.iterations = data_in.nsweeps;
 
                 rc = grb::algorithms::simulated_annealing_RE(
-				sweep, states, J, h, energies, betas, temp_states, temp_energies, sweep_data, data_in.nsweeps, data_in.use_pt
+				sweep, sweep_data, states, energies, betas, temp_states, temp_energies, data_in.nsweeps, data_in.use_pt
                 );
 			}
 			if( grb::Properties<>::isNonblockingExecution ) {
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 2fd397f33..9240b2cfc 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -304,6 +304,8 @@ template<
 		>,
 		Backend backend = grb::reference,
 		typename SweepDataType = std::tuple<
+				 	 const grb::Matrix< JType >&,
+				 	 const grb::Vector< JType >&,
 					 grb::Vector< JType, backend >&,
 					 grb::Vector< JType, backend >&,
 					 grb::Vector< IOType, backend >&,
@@ -314,11 +316,11 @@ template<
 		grb::Descriptor descr = grb::descriptors::no_operation
 	>
 EnergyType sequential_sweep_immediate(
-				 const grb::Matrix< JType, backend >& couplings,
-				 const grb::Vector< JType, backend > &local_fields,
 				 grb::Vector< IOType, backend > &state,
 				 const JType &beta,
 				 std::tuple<
+				 	 const grb::Matrix< JType, backend >&,
+				 	 const grb::Vector< JType, backend >&,
 					 grb::Vector< JType, backend >&,
 					 grb::Vector< JType, backend >&,
 					 grb::Vector< IOType, backend >&,
@@ -337,13 +339,15 @@ EnergyType sequential_sweep_immediate(
 		const size_t n = grb::size( state );
 		EnergyType delta_energy = static_cast< JType >(0.0);
 
-		auto &h 		= std::get<0>(data);
-		auto &log_rand	= std::get<1>(data);
-		auto &delta		= std::get<2>(data);
-		const auto &masks = std::get<3>(data);
-		auto &dn		= std::get<4>(data);
-		auto &accept	= std::get<5>(data);
-		auto &rng       = std::get<6>(data);
+		const auto &couplings 	= std::get<0>(data);
+		const auto &local_fields = std::get<1>(data);
+		auto &h 		= std::get<2>(data);
+		auto &log_rand	= std::get<3>(data);
+		auto &delta		= std::get<4>(data);
+		const auto &masks = std::get<5>(data);
+		auto &dn		= std::get<6>(data);
+		auto &accept	= std::get<7>(data);
+		auto &rng       = std::get<8>(data);
 
 		rc = rc ? rc : grb::wait();
 		rc = rc ? rc : grb::resize( h, n );
@@ -437,6 +441,8 @@ EnergyType sequential_sweep_immediate(
 template<
 		Backend backend,
 		typename SweepDataType = std::tuple<
+				 	 const grb::Matrix< JType, backend >&,
+				 	 const grb::Vector< JType, backend >&,
 					 grb::Vector< JType, backend >&,
 					 grb::Vector< JType, backend >&,
 					 grb::Vector< IOType, backend >&,
@@ -668,6 +674,8 @@ void grbProgram(
 	grb::Vector< bool, grb::reference > temp_accept ( n );
 	grb::Vector< IOType, grb::reference > temp_delta ( n );
 	auto sweep_data = std::tie(
+			(const typeof(J)&) J,
+			(const typeof(h)&) h,
  			temp_h,
 			temp_log_rand,
 			temp_delta,
@@ -683,7 +691,7 @@ void grbProgram(
 	if( out.rep == 0 ) {
 		timer.reset();
 		rc = grb::algorithms::simulated_annealing_RE(
-				sweep, states, J, h, energies, betas, temp_states, temp_energies, sweep_data, data_in.nsweeps, data_in.use_pt
+				sweep, sweep_data, states, energies, betas, temp_states, temp_energies, data_in.nsweeps, data_in.use_pt
         );
 
 		rc = rc ? rc : wait();
@@ -729,7 +737,7 @@ void grbProgram(
 				out.iterations = data_in.nsweeps;
 
                 rc = grb::algorithms::simulated_annealing_RE(
-				sweep, states, J, h, energies, betas, temp_states, temp_energies, sweep_data, data_in.nsweeps, data_in.use_pt
+					sweep, sweep_data, states, energies, betas, temp_states, temp_energies, data_in.nsweeps, data_in.use_pt
                 );
 			}
 			if( grb::Properties<>::isNonblockingExecution ) {

From 6b39d3692d78f71a32f761f0f14aa322526d1911 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 7 Nov 2025 09:57:08 +0100
Subject: [PATCH 14/58] Added signatures for builtin SA optimization of common
 problems

---
 .../algorithms/simulated_annealing_re.hpp     | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index c31fe1b5b..498d63351 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -309,6 +309,101 @@ namespace grb {
 			return rc;
 		}
 
+		/*
+		 * Estimate a solution to a given Ising problem. The solution is found
+		 * using the Simulated Annealing-Replica Exchange function above.
+		 *
+		 *  TODO: expand and complete documentation
+		 *
+		 * @param[in,out] states        On input: initial states.
+		 *                              On output: optimized states.
+		 * @param[in]     couplings     The square (symmetric) couplings matrix.
+		 * @param[in]     local_fields  The vector of local fields.
+		 * @param[in,out] energies      The initial energy of each state.
+		 * @param[in,out] betas     	Inverse temperature of each state.
+		 * @param[in]     n_replicas    Number of replicas to run concurrently.
+		 * @param[in]     n_sweeps      Number of Simulated Annealing iterations.
+		 * @param[in]     use_pt		Whether to use Parallel Tampering or not.
+		 *
+		 * @tparam StateType	The state variable type.
+		 * @tparam QType		The matrix values' type.
+		 * @tparam EnergyType	The energy type.
+		 * @tparam TempType		The inverse temperature type.
+		 * @tparam SweepDataType	Type of data to be passed on to the sweep function (e.g. a tuple of references to temporary vectors).
+		 *
+		 */
+		template<
+			Backend backend,
+			typename StateType, // type of state, possibly 0/1
+			typename QType, // type of coupling matrix values
+			typename EnergyType,
+			typename TempType,
+			typename SweepDataType, // type of data to be passed through to the sweep function
+			typename SweepFuncType = std::function< 
+					EnergyType(
+						 grb::Vector< StateType, backend >&,
+						 const TempType&,
+						 SweepDataType&
+				 	)
+				>,
+				typename RSI, typename CSI, typename NZI
+			>
+		grb::RC simulated_annealing_RE_Ising(
+				const grb::Matrix< QType, backend, RSI, CSI, NZI >& Q,
+				std::vector< grb::Vector< StateType, backend > > &states,
+				grb::Vector< EnergyType, backend > &energies,
+				grb::Vector< TempType, backend > &betas,
+				const size_t &n_sweeps,
+				const bool &use_pt = false
+				);
+
+		/*
+		 * Estimate a solution to a given QUBO problem. The solution is found
+		 * using the Simulated Annealing-Replica Exchange function above.
+		 *
+		 *  TODO: expand and complete documentation
+		 *
+		 * @param[in,out] states        On input: initial states.
+		 *                              On output: optimized states.
+		 * @param[in]     couplings     The square (symmetric) couplings matrix.
+		 * @param[in,out] energies      The initial energy of each state.
+		 * @param[in,out] betas     	Inverse temperature of each state.
+		 * @param[in]     n_replicas    Number of replicas to run concurrently.
+		 * @param[in]     n_sweeps      Number of Simulated Annealing iterations.
+		 * @param[in]     use_pt		Whether to use Parallel Tampering or not.
+		 *
+		 * @tparam StateType	The state variable type.
+		 * @tparam QType		The matrix values' type.
+		 * @tparam EnergyType	The energy type.
+		 * @tparam TempType		The inverse temperature type.
+		 * @tparam SweepDataType	Type of data to be passed on to the sweep function (e.g. a tuple of references to temporary vectors).
+		 *
+		 */
+		template<
+			Backend backend,
+			typename StateType, // type of state, possibly 0/1
+			typename QType, // type of coupling matrix values
+			typename EnergyType,
+			typename TempType,
+			typename SweepDataType, // type of data to be passed through to the sweep function
+			typename SweepFuncType = std::function< 
+					EnergyType(
+						 grb::Vector< StateType, backend >&,
+						 const TempType&,
+						 SweepDataType&
+				 	)
+				>,
+				typename RSI, typename CSI, typename NZI
+			>
+		grb::RC simulated_annealing_RE_QUBO(
+				const grb::Matrix< QType, backend, RSI, CSI, NZI >& Q,
+				std::vector< grb::Vector< StateType, backend > > &states,
+				grb::Vector< EnergyType, backend > &energies,
+				grb::Vector< TempType, backend > &betas,
+				const size_t &n_sweeps,
+				const bool &use_pt = false
+				);
+
 		template< typename T >
 		inline T
 		exp(T x ){

From 493dc4a78645e422760f773efd26a5ac801ff397 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 11 Nov 2025 11:59:30 +0100
Subject: [PATCH 15/58] Cleanup: remove debug prints and commented-out code

---
 .../algorithms/simulated_annealing_re.hpp     | 10 +++---
 tests/smoke/simulated_annealing_re.cpp        | 32 +++----------------
 .../smoke/simulated_annealing_re_from_mpi.cpp | 22 ++-----------
 3 files changed, 10 insertions(+), 54 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 498d63351..201aab620 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -30,7 +30,6 @@
 #include <type_traits>
 #include <algorithm>
 #include <cstdlib>
-#include <cassert>
 #include <cmath>
 
 #ifndef NDEBUG
@@ -128,7 +127,6 @@ namespace grb {
 			int rand = std::rand();
 
 			for( size_t si = nprocs ; rc == grb::SUCCESS && si > 0; --si ){
-				std::cerr << "Hello from process " << s << std::endl;
 				if( si == s+1 ){
 					for( size_t i = n_replicas - 1 ; i > 0 ; --i ){
 						const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
@@ -150,7 +148,7 @@ namespace grb {
 				}
 				if( si == 1 ) continue;
 
-				std::cerr << "Calling broadcasts" << std::endl;
+				// std::cerr << "Calling broadcasts" << std::endl;
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].s, si-2 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].e, si-2 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].b, si-2 );
@@ -282,7 +280,6 @@ namespace grb {
 						temp_states[j] = states[j];
 					}
 				} // n_replicas
-				// std::cerr << "Iteration " << i_sweep << " " << rc << std::endl;
 				if( rc == SUCCESS && use_pt ){
 					// do a Parallel Tempering move
 					rc = pt< backend >( states, energies, betas );
@@ -315,13 +312,14 @@ namespace grb {
 		 *
 		 *  TODO: expand and complete documentation
 		 *
+		 * This function allocates O(n*n_replicas) memory for temporary vectors.
+		 *
 		 * @param[in,out] states        On input: initial states.
 		 *                              On output: optimized states.
 		 * @param[in]     couplings     The square (symmetric) couplings matrix.
 		 * @param[in]     local_fields  The vector of local fields.
 		 * @param[in,out] energies      The initial energy of each state.
 		 * @param[in,out] betas     	Inverse temperature of each state.
-		 * @param[in]     n_replicas    Number of replicas to run concurrently.
 		 * @param[in]     n_sweeps      Number of Simulated Annealing iterations.
 		 * @param[in]     use_pt		Whether to use Parallel Tampering or not.
 		 *
@@ -396,7 +394,7 @@ namespace grb {
 				typename RSI, typename CSI, typename NZI
 			>
 		grb::RC simulated_annealing_RE_QUBO(
-				const grb::Matrix< QType, backend, RSI, CSI, NZI >& Q,
+				const grb::Matrix< QType, backend, RSI, CSI, NZI > &Q,
 				std::vector< grb::Vector< StateType, backend > > &states,
 				grb::Vector< EnergyType, backend > &energies,
 				grb::Vector< TempType, backend > &betas,
diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index dea6150df..199525eef 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -292,21 +292,11 @@ EnergyType get_energy(
 }
 
 template<
-		typename SweepDataType = std::tuple<
-				 	 const grb::Matrix< JType >&,
-				 	 const grb::Vector< JType >&,
-					 grb::Vector< JType >&,
-					 grb::Vector< JType >&,
-					 grb::Vector< IOType >&,
-					 const std::vector< grb::Vector< bool > >&,
-					 grb::Vector< EnergyType >&,
-					 grb::Vector< bool >&
-					 >,
-		grb::Descriptor descr = grb::descriptors::no_operation,
 		class Ring = Semiring<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one
-		>
+		>,
+		grb::Descriptor descr = grb::descriptors::no_operation
 	>
 static EnergyType sequential_sweep_immediate(
 				 grb::Vector< IOType > &state,
@@ -328,7 +318,7 @@ static EnergyType sequential_sweep_immediate(
 
 		grb::RC rc = grb::SUCCESS;
 		const size_t n = grb::size( state );
-		EnergyType delta_energy = static_cast< JType >(0.0);
+		EnergyType delta_energy = static_cast< EnergyType >(0.0);
 
 		const auto &couplings 	= std::get<0>(data);
 		const auto &local_fields = std::get<1>(data);
@@ -350,13 +340,11 @@ static EnergyType sequential_sweep_immediate(
 		rc = rc ? rc : grb::set( h, local_fields );
 		rc = rc ? rc : grb::mxv( h, couplings, state , ring );
 
-		std::uniform_real_distribution< JType > rand ( 0.0, 1.0 );
+		static std::uniform_real_distribution< JType > rand ( 0.0, 1.0 );
 		for( size_t j = 0 ; j < n ; ++j ){
 			const auto rnd = rand( rng );
 			rc = rc ? rc : grb::setElement(log_rand,  std::log( rnd ), j );
 		}
-		// rc = rc ? rc : grb::wait();
-		// print_vector( log_rand, 30, "log_rand" );
 
 #ifndef NDEBUG
 		const grb::Vector< IOType > old_state = state;
@@ -383,14 +371,10 @@ static EnergyType sequential_sweep_immediate(
 							accept[i] = ( dn[i] >= 0 ) || ( log_rand[i] < beta * dn[i] );
 						}
 					}, mask, log_rand, dn, accept );
-			// print_vector( log_rand, 30, "log_rand" );
-			// print_vector( mask, 30, "mask" );
-			// print_vector( accept, 30, "accept" );
 
 			// new_state = np.where(accept, 1 - old, old)
 			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeMonoid() );
 			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( 1 ), ring.getAdditiveMonoid() );
-			// print_vector( state, 30, "state" );
 			
 			// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
 			rc = rc ? rc : grb::clear( delta  );
@@ -400,7 +384,6 @@ static EnergyType sequential_sweep_immediate(
 			
 			// Update delta_energy -= dot(dn, accept)
 			rc = rc ? rc : grb::dot< descr >( delta_energy, delta, h, ring );
-			// rc = rc ? rc : grb::wait();
 
 			// update h
 			rc = rc ? rc : grb::mxv( h, couplings, delta, ring );
@@ -421,12 +404,9 @@ static EnergyType sequential_sweep_immediate(
 		std::cerr << "\n\t Delta_energy: " << delta_energy;
 		std::cerr << "\n\t Real delta: " << real_delta;
 		std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy;
-		// std::cerr << "\n\t Old energy: " << get_energy(couplings, local_fields, old_state) ;
-		// std::cerr << "\n\t New energy: " << get_energy(couplings, local_fields, new_state);
 		std::cerr << std::endl;
 
 		assert( ISCLOSE(real_delta, delta_energy ) );
-		// TODO: assert fails with nonblocking backend -> see issue #397
 #endif
 
 		return delta_energy;
@@ -527,8 +507,6 @@ void grbProgram(
 	const size_t s = spmd<>::pid();
 	assert( s < spmd<>::nprocs() );
 
-	// std::cerr << "Process " << s <<  " running at line " << __LINE__ << std::endl;
-
     grb::utils::Timer timer;
 	timer.reset();
 
@@ -644,8 +622,6 @@ void grbProgram(
 			std::cout << "With energy " << get_energy(  J, h, states[r] ) << "\n";
             std::cout << std::endl;
         }
-
-		// assert( std::abs(get_energy(  J, h, zero ) - 0.5803450826765713) < 1e-4 );
     }
     #endif
 
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 9240b2cfc..8a95a853b 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -303,16 +303,6 @@ template<
 			grb::identities::zero, grb::identities::one
 		>,
 		Backend backend = grb::reference,
-		typename SweepDataType = std::tuple<
-				 	 const grb::Matrix< JType >&,
-				 	 const grb::Vector< JType >&,
-					 grb::Vector< JType, backend >&,
-					 grb::Vector< JType, backend >&,
-					 grb::Vector< IOType, backend >&,
-					 const std::vector< grb::Vector< bool, backend > >&,
-					 grb::Vector< EnergyType, backend >&,
-					 grb::Vector< bool, backend >&
-					 >,
 		grb::Descriptor descr = grb::descriptors::no_operation
 	>
 EnergyType sequential_sweep_immediate(
@@ -331,13 +321,12 @@ EnergyType sequential_sweep_immediate(
 					 > &data
 			  ){
 		const size_t s = spmd<>::pid();
-		// std::cerr << "Process " << s <<  " running at line " << __LINE__ << std::endl;
 		const Ring ring = Ring();
 
 
 		grb::RC rc = grb::SUCCESS;
 		const size_t n = grb::size( state );
-		EnergyType delta_energy = static_cast< JType >(0.0);
+		EnergyType delta_energy = static_cast< EnergyType >(0.0);
 
 		const auto &couplings 	= std::get<0>(data);
 		const auto &local_fields = std::get<1>(data);
@@ -364,8 +353,6 @@ EnergyType sequential_sweep_immediate(
 			const auto rnd = rand( rng );
 			rc = rc ? rc : grb::setElement(log_rand,  std::log( rnd ), j );
 		}
-		// rc = rc ? rc : grb::wait();
-		// print_vector( log_rand, 30, "log_rand" );
 
 #ifndef NDEBUG
 		const grb::Vector< IOType, backend > old_state = state;
@@ -384,7 +371,7 @@ EnergyType sequential_sweep_immediate(
 
 			// ( dn >= 0 ) | ( log_rand < beta * dn )
 			rc = rc ? rc : grb::set( accept, mask );
-			rc = rc ? rc : grb::wait(); // ERROR: Segmentation Fault with nonblocking backend
+			rc = rc ? rc : grb::wait(); // needed to avoid ERROR: Segmentation Fault with nonblocking backend
 			rc = rc ? rc : grb::eWiseLambda<>(
 					[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
 						(void) i;
@@ -425,12 +412,9 @@ EnergyType sequential_sweep_immediate(
 			std::cerr << "\n\t Delta_energy: " << delta_energy;
 			std::cerr << "\n\t Real delta: " << real_delta;
 			std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy;
-			// std::cerr << "\n\t Old energy: " << get_energy(couplings, local_fields, old_state) ;
-			// std::cerr << "\n\t New energy: " << get_energy(couplings, local_fields, new_state);
 			std::cerr << std::endl;
 
 			assert( ISCLOSE(real_delta, delta_energy ) );
-			// TODO: assert fails with nonblocking backend -> see issue #397
 		}
 #endif
 
@@ -661,8 +645,6 @@ void grbProgram(
 			std::cout << "With energy " << energies[r] << "\n";
             std::cout << std::endl;
         }
-
-		// assert( std::abs(get_energy(  J, h, zero ) - 0.5803450826765713) < 1e-4 );
     }
     #endif
     rc = rc ? rc : wait();

From 578b4e9429a9a67fb5781ca1f143eb6152ca6a60 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 11 Nov 2025 14:12:47 +0100
Subject: [PATCH 16/58] Changed signature of simulated_annealing_re to return
 single best solution

---
 .../algorithms/simulated_annealing_re.hpp     | 24 +++++++++----------
 tests/smoke/simulated_annealing_re.cpp        | 15 +++---------
 .../smoke/simulated_annealing_re_from_mpi.cpp | 13 +++-------
 3 files changed, 17 insertions(+), 35 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 201aab620..022682c2a 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -202,9 +202,8 @@ namespace grb {
 		 * @param[in]     couplings     The square (symmetric) couplings matrix.
 		 * @param[in,out] energies      The initial energy of each state.
 		 * @param[in,out] betas     	Inverse temperature of each state.
-		 * @param[in,out] temp_states   Inverse temperature of each state.
-		 * @param[in,out] temp_energies Inverse temperature of each state.
-		 * @param[in]     n_replicas    Number of replicas to run concurrently.
+		 * @param[in,out] best_state
+		 * @param[in,out] best_energy
 		 * @param[in]     n_sweeps      Number of Simulated Annealing iterations.
 		 * @param[in]     use_pt		Whether to use Parallel Tampering or not.
 		 *
@@ -234,8 +233,8 @@ namespace grb {
 				std::vector< grb::Vector< StateType, backend > > &states,
 				grb::Vector< EnergyType, backend > &energies,
 				grb::Vector< TempType, backend > &betas,
-				std::vector< grb::Vector< StateType, backend > >  &temp_states,
-				grb::Vector< EnergyType, backend > &temp_energies,
+				grb::Vector< StateType, backend >  &best_state,
+				EnergyType &best_energy,
 				const size_t &n_sweeps,
 				const bool &use_pt = false
 				){
@@ -265,8 +264,8 @@ namespace grb {
 			}
 #endif
 
-			temp_energies = energies;
-			temp_states =  states;
+			best_energy = std::numeric_limits< EnergyType >::max();
+			assert( grb::size(best_state) >= n );
 
 			for( size_t i_sweep = 0 ; rc == grb::SUCCESS && i_sweep < n_sweeps ; ++i_sweep ){
 				for( size_t j = 0 ; j < n_replicas ; ++j ){
@@ -275,9 +274,9 @@ namespace grb {
 					grb::wait();
 				
 					// update_best state and energy
-					if( energies[j] < temp_energies[j] ){
-						temp_energies[j] = energies[j];
-						temp_states[j] = states[j];
+					if( energies[j] < best_energy ){
+						best_energy = energies[j];
+						best_state = states[j];
 					}
 				} // n_replicas
 				if( rc == SUCCESS && use_pt ){
@@ -297,10 +296,9 @@ namespace grb {
 					      << __FILE__ << ": " << grb::toString( rc ) << "\n";
 			}
 #endif
-			// grb::collectives<>::reduce(); ?
 			if( rc == SUCCESS ){
-				states = temp_states;
-				energies = temp_energies;
+				rc = rc ? rc : grb::collectives<>::allreduce(
+						best_energy, grb::operators::min< EnergyType >() );
 			}
 			
 			return rc;
diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index 199525eef..2ce39d20f 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -387,7 +387,6 @@ static EnergyType sequential_sweep_immediate(
 
 			// update h
 			rc = rc ? rc : grb::mxv( h, couplings, delta, ring );
-			
 		}
 		rc = rc ? rc : grb::wait();
 
@@ -629,7 +628,6 @@ void grbProgram(
     // also make betas vector os size n_replicas and initialize with 10.0
     grb::Vector< JType > betas( n_replicas );
     grb::Vector< EnergyType > energies( n_replicas );
-    grb::Vector< EnergyType > temp_energies( n_replicas );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
         rc = rc ? rc : grb::setElement( betas, static_cast< JType >(10.0), r );
         rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r] ), r );
@@ -637,7 +635,7 @@ void grbProgram(
     rc = rc ? rc : wait();
 
 
-    std::vector< grb::Vector<IOType> > temp_states;
+    grb::Vector<IOType> best_state ( n );
 	grb::Vector< JType > temp_h ( n );
 	grb::Vector< JType > temp_log_rand ( n );
 	grb::Vector< EnergyType > temp_dn ( n );
@@ -662,7 +660,7 @@ void grbProgram(
 	if( out.rep == 0 ) {
 		timer.reset();
 		rc = grb::algorithms::simulated_annealing_RE(
-				sweep, sweep_data, states, energies, betas, temp_states, temp_energies, data_in.nsweeps, data_in.use_pt
+				sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
         );
 
 		rc = rc ? rc : wait();
@@ -677,10 +675,6 @@ void grbProgram(
 		}
 		if( rc == SUCCESS ) {
 			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
-
-			for(size_t i = 0 ; i < n_replicas ; ++i ){
-				out.best_energy = std::min( out.best_energy, energies[ i ] );
-			}
 		}
 		if( rc != SUCCESS ) {
 			out.error_code = 25;
@@ -708,7 +702,7 @@ void grbProgram(
 				out.iterations = data_in.nsweeps;
 
                 rc = grb::algorithms::simulated_annealing_RE(
-				sweep, sweep_data, states, energies, betas, temp_states, temp_energies, data_in.nsweeps, data_in.use_pt
+				sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
                 );
 			}
 			if( grb::Properties<>::isNonblockingExecution ) {
@@ -726,9 +720,6 @@ void grbProgram(
 				assert( ISCLOSE( get_energy( J, h, states[r] ), energies[ r ] ) );
 			}
 		}
-		for(size_t i = 0 ; i < n_replicas ; ++i ){
-			out.best_energy = std::min( out.best_energy, energies[ i ] );
-		}
 
 		out.times.useful = time_taken / static_cast< double >( out.rep );
 		// print timing at root process
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 8a95a853b..5b29009fe 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -630,7 +630,6 @@ void grbProgram(
     // also make betas vector os size n_replicas and initialize with 10.0
     grb::Vector< JType, grb::reference > betas( n_replicas );
     grb::Vector< EnergyType, grb::reference > energies( n_replicas );
-    grb::Vector< EnergyType, grb::reference > temp_energies( n_replicas );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
         rc = rc ? rc : grb::setElement( betas, static_cast< JType >(10.0), r );
         rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r] ), r );
@@ -649,7 +648,7 @@ void grbProgram(
     #endif
     rc = rc ? rc : wait();
 
-    std::vector< grb::Vector< IOType, grb::reference > > temp_states;
+    grb::Vector< IOType, grb::reference > best_state ( n );
 	grb::Vector< JType, grb::reference > temp_h ( n );
 	grb::Vector< JType, grb::reference > temp_log_rand ( n );
 	grb::Vector< EnergyType, grb::reference > temp_dn ( n );
@@ -673,7 +672,7 @@ void grbProgram(
 	if( out.rep == 0 ) {
 		timer.reset();
 		rc = grb::algorithms::simulated_annealing_RE(
-				sweep, sweep_data, states, energies, betas, temp_states, temp_energies, data_in.nsweeps, data_in.use_pt
+				sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
         );
 
 		rc = rc ? rc : wait();
@@ -689,9 +688,6 @@ void grbProgram(
 		if( rc == SUCCESS ) {
 			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
 
-			for(size_t i = 0 ; i < n_replicas ; ++i ){
-				out.best_energy = std::min( out.best_energy, energies[ i ] );
-			}
 		}
 		if( rc != SUCCESS ) {
 			out.error_code = 25;
@@ -719,7 +715,7 @@ void grbProgram(
 				out.iterations = data_in.nsweeps;
 
                 rc = grb::algorithms::simulated_annealing_RE(
-					sweep, sweep_data, states, energies, betas, temp_states, temp_energies, data_in.nsweeps, data_in.use_pt
+					sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
                 );
 			}
 			if( grb::Properties<>::isNonblockingExecution ) {
@@ -737,9 +733,6 @@ void grbProgram(
 				assert( ISCLOSE( get_energy( J, h, states[r] ), energies[ r ] ) );
 			}
 		}
-		for(size_t i = 0 ; i < n_replicas ; ++i ){
-			out.best_energy = std::min( out.best_energy, energies[ i ] );
-		}
 
 		out.times.useful = time_taken / static_cast< double >( out.rep );
 		// print timing at root process

From 1b2bc64fa1e826ae18a355e181a086ae6565bfa0 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 11 Nov 2025 16:34:05 +0100
Subject: [PATCH 17/58] Added descriptors in simulated_annealing functions

---
 tests/smoke/simulated_annealing_re.cpp        | 48 ++++++++++---------
 .../smoke/simulated_annealing_re_from_mpi.cpp | 40 ++++++++--------
 2 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re.cpp
index 2ce39d20f..8360f3d6d 100644
--- a/tests/smoke/simulated_annealing_re.cpp
+++ b/tests/smoke/simulated_annealing_re.cpp
@@ -268,6 +268,7 @@ void read_vector_data_from_array(
 }
 
 template<
+		grb::Descriptor descr = grb::descriptors::no_operation,
 		class Ring = Semiring<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one
@@ -279,13 +280,14 @@ EnergyType get_energy(
 				 const Ring &ring = Ring()
 			  ){
 	static grb::Vector< JType > tmp ( grb::size( local_fields ) );
-	grb::RC rc = grb::clear( tmp );
+	grb::RC rc = grb::SUCCESS;
 	EnergyType energy = 0.0;
 
-	rc = rc ? rc : grb::mxv( tmp, couplings, state, ring );
-	rc = rc ? rc : grb::foldl( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() );
-	rc = rc ? rc : grb::foldl( tmp, local_fields, ring.getAdditiveMonoid() );
-	rc = rc ? rc : grb::dot<>( energy, tmp, state, ring );
+	rc = rc ? rc : grb::set( tmp, 0.0 );
+	rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( tmp, couplings, state, ring );
+	rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() );
+	rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( tmp, local_fields, ring.getAdditiveMonoid() );
+	rc = rc ? rc : grb::dot< descr | grb::descriptors::dense >( energy, tmp, state, ring );
 	assert( rc == grb::SUCCESS );
 
 	return energy;
@@ -337,8 +339,9 @@ static EnergyType sequential_sweep_immediate(
 		rc = rc ? rc : grb::resize( dn, n );
 		rc = rc ? rc : grb::resize( accept, n );
 
-		rc = rc ? rc : grb::set( h, local_fields );
-		rc = rc ? rc : grb::mxv( h, couplings, state , ring );
+		rc = rc ? rc : grb::set< descr >( h, 0.0 );
+		rc = rc ? rc : grb::set< descr | grb::descriptors::dense >( h, local_fields );
+		rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( h, couplings, state , ring );
 
 		static std::uniform_real_distribution< JType > rand ( 0.0, 1.0 );
 		for( size_t j = 0 ; j < n ; ++j ){
@@ -356,15 +359,15 @@ static EnergyType sequential_sweep_immediate(
 			rc = rc ? rc : grb::clear( dn );
 
 			// dn = (2*state_slice - 1) * h_slice
-			rc = rc ? rc : grb::set( dn, mask, state );
-			rc = rc ? rc : grb::foldl( dn, static_cast< EnergyType >( 2 ), ring.getMultiplicativeMonoid()  );
-			rc = rc ? rc : grb::foldl( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
-			rc = rc ? rc : grb::foldl( dn, h, ring.getMultiplicativeMonoid() );
+			rc = rc ? rc : grb::set< descr >( dn, mask, state );
+			rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( 2 ), ring.getMultiplicativeMonoid()  );
+			rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
+			rc = rc ? rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() );
 
 			// ( dn >= 0 ) | ( log_rand < beta * dn )
-			rc = rc ? rc : grb::set( accept, mask );
-			rc = rc ? rc : grb::wait(); // ERROR: Segmentation Fault with nonblocking backend
-			rc = rc ? rc : grb::eWiseLambda<>(
+			rc = rc ? rc : grb::set< descr >( accept, mask );
+			rc = rc ? rc : grb::wait(); // needed to avoid ERROR: Segmentation Fault with nonblocking backend
+			rc = rc ? rc : grb::eWiseLambda< descr >(
 					[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
 						(void) i;
 						if( mask[i] ){
@@ -373,20 +376,20 @@ static EnergyType sequential_sweep_immediate(
 					}, mask, log_rand, dn, accept );
 
 			// new_state = np.where(accept, 1 - old, old)
-			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeMonoid() );
-			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( 1 ), ring.getAdditiveMonoid() );
+			rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeMonoid() );
+			rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< IOType >( 1 ), ring.getAdditiveMonoid() );
 			
 			// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
 			rc = rc ? rc : grb::clear( delta  );
-			rc = rc ? rc : grb::set( delta, accept, state );
-			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( 2 ), ring.getMultiplicativeMonoid() );
-			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( -1 ), ring.getAdditiveMonoid() );
+			rc = rc ? rc : grb::set< descr >( delta, accept, state );
+			rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< IOType >( 2 ), ring.getMultiplicativeMonoid() );
+			rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< IOType >( -1 ), ring.getAdditiveMonoid() );
 			
 			// Update delta_energy -= dot(dn, accept)
 			rc = rc ? rc : grb::dot< descr >( delta_energy, delta, h, ring );
 
 			// update h
-			rc = rc ? rc : grb::mxv( h, couplings, delta, ring );
+			rc = rc ? rc : grb::mxv< descr >( h, couplings, delta, ring );
 		}
 		rc = rc ? rc : grb::wait();
 
@@ -432,13 +435,14 @@ template<
 		class Ring = Semiring<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one
-		>
+		>,
+		grb::Descriptor descr = grb::descriptors::no_operation
 	>
 SweepFuncType get_sweep_function( std::string sweep_name ){
 	if( sweep_name != "sequential_sweep_immediate" ){
 			std::cerr << "Warning: unknown sweep setting. Falling back to  \"sequential_sweep_immediate\"" << std::endl;
 	}
-	 return sequential_sweep_immediate< Ring >;
+	 return sequential_sweep_immediate< Ring, descr >;
 }
 
 void ioProgram( const struct input &data_in, bool &success ) {
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 5b29009fe..02e1fbc96 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -273,6 +273,7 @@ void read_vector_data_from_array(
 
 template<
 	Backend backend,
+	grb::Descriptor descr = grb::descriptors::no_operation,
 	class Ring = Semiring<
 		grb::operators::add< JType >, grb::operators::mul< JType >,
 		grb::identities::zero, grb::identities::one
@@ -285,13 +286,14 @@ EnergyType get_energy(
 				 const Ring &ring = Ring()
 			  ){
 	static grb::Vector< JType, backend > tmp ( grb::size( local_fields ) );
-	grb::RC rc = grb::clear( tmp );
+	grb::RC rc = grb::SUCCESS;
 	EnergyType energy = 0.0;
 
-	rc = rc ? rc : grb::mxv( tmp, couplings, state, ring );
-	rc = rc ? rc : grb::foldl( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() );
-	rc = rc ? rc : grb::foldl( tmp, local_fields, ring.getAdditiveMonoid() );
-	rc = rc ? rc : grb::dot<>( energy, tmp, state, ring );
+	rc = rc ? rc : grb::set< descr >( tmp, 0.0 );
+	rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( tmp, couplings, state, ring );
+	rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() );
+	rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( tmp, local_fields, ring.getAdditiveMonoid() );
+	rc = rc ? rc : grb::dot< descr | grb::descriptors::dense >( energy, tmp, state, ring );
 	assert( rc == grb::SUCCESS );
 
 	return energy;
@@ -345,8 +347,8 @@ EnergyType sequential_sweep_immediate(
 		rc = rc ? rc : grb::resize( dn, n );
 		rc = rc ? rc : grb::resize( accept, n );
 
-		rc = rc ? rc : grb::set( h, local_fields );
-		rc = rc ? rc : grb::mxv( h, couplings, state , ring );
+		rc = rc ? rc : grb::set< descr | grb::descriptors::dense >( h, local_fields );
+		rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( h, couplings, state , ring );
 
 		std::uniform_real_distribution< JType > rand ( 0.0, 1.0 );
 		for( size_t j = 0 ; j < n ; ++j ){
@@ -364,15 +366,15 @@ EnergyType sequential_sweep_immediate(
 			rc = rc ? rc : grb::clear( dn );
 
 			// dn = (2*state_slice - 1) * h_slice
-			rc = rc ? rc : grb::set( dn, mask, state );
-			rc = rc ? rc : grb::foldl( dn, static_cast< EnergyType >( 2 ), ring.getMultiplicativeMonoid()  );
-			rc = rc ? rc : grb::foldl( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
-			rc = rc ? rc : grb::foldl( dn, h, ring.getMultiplicativeMonoid() );
+			rc = rc ? rc : grb::set< descr >( dn, mask, state );
+			rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( 2 ), ring.getMultiplicativeMonoid()  );
+			rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
+			rc = rc ? rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() );
 
 			// ( dn >= 0 ) | ( log_rand < beta * dn )
-			rc = rc ? rc : grb::set( accept, mask );
+			rc = rc ? rc : grb::set< descr >( accept, mask );
 			rc = rc ? rc : grb::wait(); // needed to avoid ERROR: Segmentation Fault with nonblocking backend
-			rc = rc ? rc : grb::eWiseLambda<>(
+			rc = rc ? rc : grb::eWiseLambda< descr >(
 					[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
 						(void) i;
 						if( mask[i] ){
@@ -381,20 +383,20 @@ EnergyType sequential_sweep_immediate(
 					}, mask, log_rand, dn, accept );
 
 			// new_state = np.where(accept, 1 - old, old)
-			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeMonoid() );
-			rc = rc ? rc : grb::foldl( state, accept, static_cast< IOType >( 1 ), ring.getAdditiveMonoid() );
+			rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeMonoid() );
+			rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< IOType >( 1 ), ring.getAdditiveMonoid() );
 			
 			// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
 			rc = rc ? rc : grb::clear( delta  );
-			rc = rc ? rc : grb::set( delta, accept, state );
-			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( 2 ), ring.getMultiplicativeMonoid() );
-			rc = rc ? rc : grb::foldl( delta, accept, static_cast< IOType >( -1 ), ring.getAdditiveMonoid() );
+			rc = rc ? rc : grb::set< descr >( delta, accept, state );
+			rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< IOType >( 2 ), ring.getMultiplicativeMonoid() );
+			rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< IOType >( -1 ), ring.getAdditiveMonoid() );
 			
 			// Update delta_energy -= dot(dn, accept)
 			rc = rc ? rc : grb::dot< descr >( delta_energy, delta, h, ring );
 
 			// update h
-			rc = rc ? rc : grb::mxv( h, couplings, delta, ring );
+			rc = rc ? rc : grb::mxv< descr >( h, couplings, delta, ring );
 			
 		}
 		rc = rc ? rc : grb::wait();

From 186654b186182be61bdd070938c40d32da5843df Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 11 Nov 2025 17:12:48 +0100
Subject: [PATCH 18/58] Added implementation of simulated_annealing_RE_Ising
 and simulated_annealing_RE_QUBO

---
 .../algorithms/simulated_annealing_re.hpp     | 254 +++++++++++++++---
 1 file changed, 217 insertions(+), 37 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 022682c2a..c89aa703f 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -28,6 +28,8 @@
 
 #include <vector>
 #include <type_traits>
+#include <tuple>
+#include <random>
 #include <algorithm>
 #include <cstdlib>
 #include <cmath>
@@ -38,7 +40,34 @@
 
 #include <graphblas.hpp>
 
+#define ISCLOSE(a,b) (std::abs((b)-(a))/std::abs(a) < 1e-4) || (std::abs((b)-(a)) < 1e-4)
+
 namespace grb {
+	namespace internal {
+		/*
+		 * The following wrappers restrict the argument of the exponential and
+		 * logarithm functions to float, double, or long double.
+		 */
+		template< typename T >
+		inline T exp(T x ){
+			static_assert(
+					std::is_same<T, float>::value
+				 || std::is_same<T, double>::value
+				 || std::is_same<T, long double>::value
+					);
+			return std::exp( x );
+		}
+
+		template< typename T >
+		inline T log(T x ){
+			static_assert(
+					std::is_same<T, float>::value
+				 || std::is_same<T, double>::value
+				 || std::is_same<T, long double>::value
+					);
+			return std::log( x );
+		}
+	} // namespace internal
 
 	namespace algorithms {
 
@@ -81,7 +110,7 @@ namespace grb {
 			for( size_t i = n_replicas - 1 ; i > 0 ; --i ){
 				const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
 
-				if( de >= 0 || std::rand() < RAND_MAX * exp( de ) ){
+				if( de >= 0 || std::rand() < RAND_MAX * internal::exp( de ) ){
 					std::swap( states[i], states[i-1] );
 					std::swap( energies[i], energies[i-1] );
 				}
@@ -131,7 +160,7 @@ namespace grb {
 					for( size_t i = n_replicas - 1 ; i > 0 ; --i ){
 						const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
 
-						if( de >= 0 || std::rand() < RAND_MAX * exp( de ) ){
+						if( de >= 0 || std::rand() < RAND_MAX * internal::exp( de ) ){
 							std::swap( states[i], states[i-1] );
 							std::swap( energies[i], energies[i-1] );
 						}
@@ -167,7 +196,7 @@ namespace grb {
 
 				const EnergyType de = ( msg[ 1 ].e - msg[ 0 ].e ) * ( msg[ 1 ].b - msg[ 0 ].b );
 
-				if( rc == grb::SUCCESS && ( de >= 0 || msg[ 0 ].r < RAND_MAX * exp( de ) ) ){
+				if( rc == grb::SUCCESS && ( de >= 0 || msg[ 0 ].r < RAND_MAX * internal::exp( de ) ) ){
 					if( si == s+2 ){
 						states[ 0 ] = msg[ 0 ].s;
 						energies[ 0 ] = msg[ 0 ].e;
@@ -315,6 +344,7 @@ namespace grb {
 		 * @param[in,out] states        On input: initial states.
 		 *                              On output: optimized states.
 		 * @param[in]     couplings     The square (symmetric) couplings matrix.
+		 *                              The diagonal has to be zero!
 		 * @param[in]     local_fields  The vector of local fields.
 		 * @param[in,out] energies      The initial energy of each state.
 		 * @param[in,out] betas     	Inverse temperature of each state.
@@ -330,28 +360,176 @@ namespace grb {
 		 */
 		template<
 			Backend backend,
+			grb::Descriptor descr = grb::descriptors::no_operation,
+			bool empty_local_fields = false,
 			typename StateType, // type of state, possibly 0/1
 			typename QType, // type of coupling matrix values
 			typename EnergyType,
 			typename TempType,
-			typename SweepDataType, // type of data to be passed through to the sweep function
-			typename SweepFuncType = std::function< 
-					EnergyType(
-						 grb::Vector< StateType, backend >&,
-						 const TempType&,
-						 SweepDataType&
-				 	)
-				>,
-				typename RSI, typename CSI, typename NZI
+			typename RSI, typename CSI, typename NZI,
+			class Ring = Semiring<
+				grb::operators::add< QType >, grb::operators::mul< QType >,
+				grb::identities::zero, grb::identities::one
+			>
 			>
 		grb::RC simulated_annealing_RE_Ising(
-				const grb::Matrix< QType, backend, RSI, CSI, NZI >& Q,
+				const grb::Matrix< QType, backend, RSI, CSI, NZI > &couplings,
+				const grb::Vector< QType, backend> &local_fields,
 				std::vector< grb::Vector< StateType, backend > > &states,
 				grb::Vector< EnergyType, backend > &energies,
 				grb::Vector< TempType, backend > &betas,
+				grb::Vector< StateType, backend > &best_state,
+				EnergyType &best_energy,
 				const size_t &n_sweeps,
-				const bool &use_pt = false
-				);
+				const bool &use_pt = false,
+				const int seed = 42,
+				const Ring &ring = Ring()
+				){
+			const size_t n = grb::size(local_fields);
+			const size_t n_replicas = grb::size(betas);
+			const size_t s 		= spmd<>::pid();
+			grb::RC rc = grb::SUCCESS;
+
+			assert( grb::size(states[0]) == n );
+			assert( grb::nnz(states[0]) == n ); // state is dense
+			assert( states.size() == n_replicas );
+
+			EnergyType energy;
+			grb::Vector< EnergyType > tmp_calc_energy ( n );
+			const auto get_energy = [&couplings, &local_fields, &tmp_calc_energy, &ring](
+					EnergyType &energy, const grb::Vector< StateType > &state
+					){
+				grb::RC rc = grb::SUCCESS;
+				grb::set( tmp_calc_energy, static_cast<EnergyType>( 0.0 ) );
+				rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( tmp_calc_energy, couplings, state, ring );
+				rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( tmp_calc_energy, static_cast< EnergyType >( 0.5 ),
+						ring.getMultiplicativeMonoid() );
+				if( !empty_local_fields) {
+					rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( tmp_calc_energy, local_fields, ring.getAdditiveMonoid() );
+				}
+				rc = rc ? rc : grb::dot< descr | grb::descriptors::dense >( energy, tmp_calc_energy, state, ring );
+				return rc;
+			};
+
+			// it is reasonable to allow the energies to be allocated and evaluated by this function
+			if( grb::nnz(energies) == 0 ){
+				grb::resize( energies, n_replicas );
+
+				for(size_t i = 0 ; i < n_replicas ; ++i){
+					energy = static_cast< EnergyType >( 0.0 );
+					rc = rc ? rc : get_energy( energy, states[i] );
+					grb::setElement( energies, energy, i );
+				}
+			}
+
+			std::vector< grb::Vector< bool > > masks ;
+			for( size_t i = 0 ; i < n ; ++i ){
+				masks.push_back( grb::Vector< bool >(n) );
+				grb::setElement( masks[i], true, i );
+			}
+			grb::Vector< QType > h ( n );
+			grb::Vector< QType > log_rand ( n );
+			grb::Vector< StateType > delta ( n );
+			grb::Vector< EnergyType > dn ( n );
+			grb::Vector< bool > accept ( n );
+    		std::srand( static_cast<unsigned>( seed + s ) );
+    		std::minstd_rand rng ( seed ); // minstd_rand or std::mt19937
+
+			auto sweep_data = std::tie(energy);
+
+			const auto ising_sweep = [&](
+				 grb::Vector< StateType > &state,
+				 const TempType &beta,
+				 typeof(sweep_data) &data
+			  ){
+				(void) data;
+				const size_t n = grb::size( state );
+				EnergyType delta_energy = static_cast< EnergyType >(0.0);
+				grb::RC rc = grb::SUCCESS;
+
+				if( !empty_local_fields) {
+					rc = rc ? rc : grb::set< descr >( h, local_fields );
+				}else {
+					rc = rc ? rc : grb::set< descr >( h, static_cast< QType >( 0.0 ) );
+				}
+				rc = rc ? rc : grb::mxv< descr >( h, couplings, state , ring );
+				std::uniform_real_distribution< QType > rand ( 0.0, 1.0 );
+				for( size_t j = 0 ; j < n ; ++j ){
+					const auto rnd = rand( rng );
+					rc = rc ? rc : grb::setElement(log_rand,  log( rnd ), j );
+				}
+#ifndef NDEBUG
+				const grb::Vector< StateType > old_state = state;
+
+#endif
+				for(const auto &mask : masks ){
+					rc = rc ? rc : grb::clear( accept  );
+					rc = rc ? rc : grb::clear( delta  );
+					rc = rc ? rc : grb::clear( dn );
+
+					// dn = (2*state_slice - 1) * h_slice
+					rc = rc ? rc : grb::set< descr >( dn, mask, state );
+					rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( 2 ), ring.getMultiplicativeMonoid()  );
+					rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
+					rc = rc ? rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() );
+
+					// ( dn >= 0 ) | ( log_rand < beta * dn )
+					rc = rc ? rc : grb::set< descr >( accept, mask );
+					rc = rc ? rc : grb::wait(); // needed to avoid ERROR: Segmentation Fault with nonblocking backend
+					rc = rc ? rc : grb::eWiseLambda< descr >(
+							[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
+								(void) i;
+								if( mask[i] ){
+									accept[i] = ( dn[i] >= 0 ) || ( log_rand[i] < beta * dn[i] );
+								}
+							}, mask, log_rand, dn, accept );
+
+					// new_state = np.where(accept, 1 - old, old)
+					rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( -1 ), ring.getMultiplicativeMonoid() );
+					rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( 1 ), ring.getAdditiveMonoid() );
+					
+					// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
+					rc = rc ? rc : grb::clear( delta  );
+					rc = rc ? rc : grb::set< descr >( delta, accept, state );
+					rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< StateType >( 2 ), ring.getMultiplicativeMonoid() );
+					rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< StateType >( -1 ), ring.getAdditiveMonoid() );
+					
+					// Update delta_energy -= dot(dn, accept)
+					rc = rc ? rc : grb::dot< descr >( delta_energy, delta, h, ring );
+
+					// update h
+					rc = rc ? rc : grb::mxv< descr >( h, couplings, delta, ring );
+				}
+				rc = rc ? rc : grb::wait();
+
+#ifndef NDEBUG
+				if( rc != grb::SUCCESS ){
+					std::cerr << "\n\t Error in some GraphBLAS function of ising_sweep " << rc << " : " << grb::toString( rc ) << std::endl;
+					abort();
+				}
+				assert( rc == grb::SUCCESS );
+				const auto new_state = state;
+				rc = rc ? rc : grb::wait();
+
+				EnergyType e1 = static_cast< EnergyType >( 0.0 ),
+						   e2 = static_cast< EnergyType >( 0.0 );
+				get_energy(e1, old_state);
+				get_energy(e2, new_state);
+				const auto real_delta = e2 - e1;
+				std::cerr << "\n\t Delta_energy: " << delta_energy;
+				std::cerr << "\n\t Real delta: " << real_delta;
+				std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy;
+				std::cerr << std::endl;
+
+				assert( ISCLOSE(real_delta, delta_energy ) );
+#endif
+				return delta_energy;
+			};
+
+			return simulated_annealing_RE(
+					ising_sweep, sweep_data, states, energies, betas, best_state, best_energy, n_sweeps, use_pt
+					);
+		}
 
 		/*
 		 * Estimate a solution to a given QUBO problem. The solution is found
@@ -359,9 +537,11 @@ namespace grb {
 		 *
 		 *  TODO: expand and complete documentation
 		 *
-		 * @param[in,out] states        On input: initial states.
-		 *                              On output: optimized states.
-		 * @param[in]     couplings     The square (symmetric) couplings matrix.
+		 * Warning: This function allocates O(n*n_replicas) memory for temporary vectors.
+		 *
+		 * @param[in,out] states        On input: initial (dense) states.
+		 *                              On output: optimized (dense) states.
+		 * @param[in]     Q             The square symmetric $Q$ matrix.
 		 * @param[in,out] energies      The initial energy of each state.
 		 * @param[in,out] betas     	Inverse temperature of each state.
 		 * @param[in]     n_replicas    Number of replicas to run concurrently.
@@ -377,40 +557,40 @@ namespace grb {
 		 */
 		template<
 			Backend backend,
+			grb::Descriptor descr = grb::descriptors::no_operation,
 			typename StateType, // type of state, possibly 0/1
 			typename QType, // type of coupling matrix values
 			typename EnergyType,
 			typename TempType,
-			typename SweepDataType, // type of data to be passed through to the sweep function
-			typename SweepFuncType = std::function< 
-					EnergyType(
-						 grb::Vector< StateType, backend >&,
-						 const TempType&,
-						 SweepDataType&
-				 	)
-				>,
-				typename RSI, typename CSI, typename NZI
+			typename RSI, typename CSI, typename NZI,
+				class Ring = Semiring<
+					grb::operators::add< QType >, grb::operators::mul< QType >,
+					grb::identities::zero, grb::identities::one
+				>
 			>
 		grb::RC simulated_annealing_RE_QUBO(
 				const grb::Matrix< QType, backend, RSI, CSI, NZI > &Q,
 				std::vector< grb::Vector< StateType, backend > > &states,
 				grb::Vector< EnergyType, backend > &energies,
 				grb::Vector< TempType, backend > &betas,
+				grb::Vector< StateType, backend > &best_state,
+				EnergyType &best_energy,
 				const size_t &n_sweeps,
-				const bool &use_pt = false
-				);
+				const bool &use_pt = false,
+				const int seed = 42,
+				const Ring &ring = Ring()
+				){
+			grb::Vector< QType > empty_local_fields ( grb::ncols( Q ) );
 
-		template< typename T >
-		inline T
-		exp(T x ){
-			static_assert(std::is_same<T, float>::value ||
-				std::is_same<T, double>::value ||
-				std::is_same<T, long double>::value);
-			return std::exp( x );
+			return simulated_annealing_RE_Ising< backend, descr, true >(
+					Q, empty_local_fields, states, energies, betas, best_state, best_energy, n_sweeps, use_pt, seed, ring
+					);
 		}
-	} // namespace algorithms
 
+	
+	} // namespace algorithms
 } // end namespace grb
+#undef ISCLOSE
 
 #endif // end _H_GRB_ALGORITHMS_SA-RE
 

From 52bfa6b432ae0d9011bb8e67aee8364b590aa61d Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 11 Nov 2025 17:35:28 +0100
Subject: [PATCH 19/58] Added some documentation about simulated_annealing

---
 .../algorithms/simulated_annealing_re.hpp     | 58 +++++++++++++------
 1 file changed, 39 insertions(+), 19 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index c89aa703f..6b51adb15 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -76,8 +76,10 @@ namespace grb {
 		 * This means exchanging states at low temperature with states at higher temperature.
 		 * To make the code simpler, this will be done by exchanging the temperatures instead.
 		 *
-		 * @param[in] states        On input: initial states.
-		 * @param[in] energies      The initial energy of each state.
+		 * TODO: Fix this documentation.
+		 *
+		 * @param[in,out] states        On input: initial states.
+		 * @param[in,out] energies      The initial energy of each state.
 		 * @param[in,out] betas     Inverse temperature of each state.
 		 * 							The betas may be permuted.
 		 *
@@ -189,7 +191,8 @@ namespace grb {
 #ifndef NDEBUG
 	
 				if( rc != grb::SUCCESS ){
-					std::cerr << "\n\t Error in a collective broadcast " << rc << " : " << grb::toString( rc ) << std::endl;
+					std::cerr << "\n\t Error in a collective broadcast " << rc << " : " <<
+						grb::toString( rc ) << std::endl;
 				}
 				assert( rc == grb::SUCCESS );
 #endif
@@ -217,29 +220,36 @@ namespace grb {
 		 * Estimate a solution to a given optimization problem. The solution is found
 		 * using Simulated Annealing-Replica Exchange (also known as Parallel Tempering).
 		 *
-		 * The state will be optimized to minimize the value of the energy $U(x)$,
-		 * where $x$ is the binary state vector, and $couplings$ is the coupling matrix.
-		 * Energies will be changed when changing the states, so that each energy is
-		 * the actual energy of the relative state.
-		 * The parameter sweep is a function that (randomly) changes a given state and
-		 * returns the variation of energy made from its changes of the state.
+		 * The state will be optimized to minimize the value of an energy function $U(x)$,
+		 * where $x$ is the state vector. Energies will be changed when changing the
+		 * states, so that each energy is the actual energy of the relative state.
+		 *
+		 * The parameter sweep is a user-defined function that changes a given state
+		 * (possibly randomly) and returns the variation of energy made from its
+		 * changes of the state. It should take three parameters: a state vector, the
+		 * inverse temperature (a scalar) and sweep_data.
 		 *
 		 * @param[in]     sweep      	The sweeping function.
-		 * 								Should return the energy variation relative to the changes that it made on the state.
+		 * 								Should return the energy variation relative to the changes that it
+		 * 								made on the state.
+		 * @param[in]     sweep_data    Additional data to be passed to the sweep function.
 		 * @param[in,out] states        On input: initial states.
 		 *                              On output: optimized states.
-		 * @param[in]     couplings     The square (symmetric) couplings matrix.
 		 * @param[in,out] energies      The initial energy of each state.
 		 * @param[in,out] betas     	Inverse temperature of each state.
-		 * @param[in,out] best_state
-		 * @param[in,out] best_energy
+		 * @param[in,out] best_state	The state with the minimum energy found by the algorithm.
+		 * @param[in,out] best_energy	The minimum value of an energy found.
 		 * @param[in]     n_sweeps      Number of Simulated Annealing iterations.
 		 * @param[in]     use_pt		Whether to use Parallel Tampering or not.
 		 *
+		 * @tparam backend		The backend used for the single objects
 		 * @tparam StateType	The state variable type.
 		 * @tparam EnergyType	The energy type.
 		 * @tparam TempType		The inverse temperature type.
-		 * @tparam SweepDataType	Type of data to be passed on to the sweep function (e.g. a tuple of references to temporary vectors).
+		 * @tparam SweepDataType	Type of data to be passed on to the sweep function
+		 * (e.g. a tuple of references to temporary vectors).
+		 * @tparam SweepFuncType    The type of the function.
+		 * The default value suggests the signature that the function should have.
 		 *
 		 */
 		template<
@@ -337,12 +347,18 @@ namespace grb {
 		 * Estimate a solution to a given Ising problem. The solution is found
 		 * using the Simulated Annealing-Replica Exchange function above.
 		 *
+		 * The function minimized is $U(x) = x^T(Jx/2 + h)$, where $J$ is the supplied
+		 * couplings matrix and $h$ is the local_fields vector. The solution is searched
+		 * in the space of vectors $x$ with entries $0$ or $1$.
+		 *
+		 * states should be a std::vector of already initialized, dense grb::Vector objects.
+		 *
 		 *  TODO: expand and complete documentation
 		 *
-		 * This function allocates O(n*n_replicas) memory for temporary vectors.
+		 * Warning: This function allocates O(n*n_replicas) memory for temporary vectors.
 		 *
-		 * @param[in,out] states        On input: initial states.
-		 *                              On output: optimized states.
+		 * @param[in,out] states        On input: initial (dense) states.
+		 *                              On output: optimized (dense) states.
 		 * @param[in]     couplings     The square (symmetric) couplings matrix.
 		 *                              The diagonal has to be zero!
 		 * @param[in]     local_fields  The vector of local fields.
@@ -355,7 +371,8 @@ namespace grb {
 		 * @tparam QType		The matrix values' type.
 		 * @tparam EnergyType	The energy type.
 		 * @tparam TempType		The inverse temperature type.
-		 * @tparam SweepDataType	Type of data to be passed on to the sweep function (e.g. a tuple of references to temporary vectors).
+		 * @tparam SweepDataType	Type of data to be passed on to the sweep function
+		 * (e.g. a tuple of references to temporary vectors).
 		 *
 		 */
 		template<
@@ -535,13 +552,16 @@ namespace grb {
 		 * Estimate a solution to a given QUBO problem. The solution is found
 		 * using the Simulated Annealing-Replica Exchange function above.
 		 *
+		 * The function minimized is $U(x) = x^TQx/2$ (the Ising energy above with zero
+		 * local fields), with the constraint that $x$ is a 0/1 vector.
+		 *
 		 *  TODO: expand and complete documentation
 		 *
 		 * Warning: This function allocates O(n*n_replicas) memory for temporary vectors.
 		 *
 		 * @param[in,out] states        On input: initial (dense) states.
 		 *                              On output: optimized (dense) states.
-		 * @param[in]     Q             The square symmetric $Q$ matrix.
+		 * @param[in]     Q             The square (symmetric) Q matrix.
 		 * @param[in,out] energies      The initial energy of each state.
 		 * @param[in,out] betas     	Inverse temperature of each state.
 		 * @param[in]     n_replicas    Number of replicas to run concurrently.

From f0fc8defdc76daf39cfb233baa842530523043a2 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Thu, 13 Nov 2025 12:38:33 +0100
Subject: [PATCH 20/58] Added simple working matrix partitioning/graph coloring
 algorithm

---
 .../algorithms/simulated_annealing_re.hpp     | 122 +++++++++++++++---
 1 file changed, 103 insertions(+), 19 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 6b51adb15..e8817b2a2 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -343,6 +343,96 @@ namespace grb {
 			return rc;
 		}
 
+		/*
+		 * Create a set of independent masks.
+		 *
+		 * Uses a graph coloring algorithm.
+		 * Adapted from Alg. 2 of `Graph Coloring on the GPU, M. Osama, M. Truong, C. Yang, A. Buluc, J.D. Owens`.
+		 *
+		 * @param[out] masks            The vector of constructed masks.
+		 * @param[in]     A             The square (symmetric) A matrix.
+		 *
+		 * @tparam MaskType	The state variable type.
+		 * @tparam AType		The matrix values' type.
+		 *
+		 */
+		template<
+			grb::Descriptor descr = grb::descriptors::no_operation,
+			Backend backend,
+			typename MaskType,
+			typename AType,
+			typename RSI, typename CSI, typename NZI
+		>
+		grb::RC matrix_partition(
+				std::vector< grb::Vector< MaskType, backend > > &masks,
+				const grb::Matrix< AType, backend, RSI, CSI, NZI > &A,
+				const int seed = 42
+				) {
+			masks.clear();
+			grb::RC rc = grb::SUCCESS;
+			const size_t n = grb::nrows( A );
+			assert( n == grb::ncols( A ) ); // A needs to be square
+
+			grb::Vector< AType, backend > frontier ( n );
+			grb::Vector< AType, backend > w ( n );
+
+    		std::minstd_rand rng ( seed );
+			std::uniform_real_distribution< AType > rand ( 0.1, 2.0 );
+
+			for( size_t i = 0 ; i < n ; ++i ){
+				rc = rc ? rc : grb::setElement( w, rand( rng ), i );
+			}
+
+			const grb::Semiring<
+			grb::operators::max< AType >, grb::operators::right_assign< AType >,
+			grb::identities::negative_infinity, grb::identities::zero
+			> maxTimesRing;
+			const grb::Monoid< grb::operators::add< AType >, grb::identities::zero > addMonoid;
+			const grb::operators::greater_than< AType > gtOp;
+			const grb::operators::logical_and< bool > orOp;
+			const grb::operators::not_equal< bool > xorOp;
+
+			grb::Vector< bool > remaining ( n );
+			grb::set( remaining, true );
+			for( size_t i = 0; rc == grb::SUCCESS && i < n ; ++i ) {
+				// find max of neighbors
+				const auto w1 = w;
+				rc = rc ? rc : grb::clear( frontier );
+				rc = rc ? rc : grb::mxv< descr >( frontier, A, w1, maxTimesRing );
+				rc = rc ? rc : grb::foldl< descr >( frontier, w1, gtOp );
+
+				AType succ = static_cast< AType >( 0 );
+				rc = rc ? rc : grb::foldl< descr >( succ, frontier, addMonoid );
+				if( succ <= 0 ){
+					break;
+				}
+				if( masks.size() <= i ) {
+					masks.emplace_back( grb::Vector< bool >( n ) );
+				}else{
+					grb::clear( masks.at(i) );
+				}
+				auto &new_mask = masks.at(i);
+				rc = rc ? rc : grb::resize( new_mask, n );
+				new_mask = remaining;
+				rc = rc ? rc : grb::foldl< descr >( new_mask, frontier, orOp);
+
+				rc = rc ? rc : grb::foldl< descr >( remaining, new_mask, xorOp );
+				rc = rc ? rc : grb::set< descr >( w, remaining, w1 );
+			}
+			assert( rc == grb::SUCCESS );
+
+#ifndef NDEBUG
+			std::cerr << "Final masks: \n";
+			for(const auto&mask : masks ){
+				for( const auto &x : mask ){
+					if( x.second ) std::cerr << x.first << ", ";
+				}
+				std::cerr << std::endl;
+			}
+#endif
+			return rc;
+		}
+	
 		/*
 		 * Estimate a solution to a given Ising problem. The solution is found
 		 * using the Simulated Annealing-Replica Exchange function above.
@@ -402,7 +492,7 @@ namespace grb {
 				const int seed = 42,
 				const Ring &ring = Ring()
 				){
-			const size_t n = grb::size(local_fields);
+			const size_t n = grb::size( states[0] );
 			const size_t n_replicas = grb::size(betas);
 			const size_t s 		= spmd<>::pid();
 			grb::RC rc = grb::SUCCESS;
@@ -412,7 +502,7 @@ namespace grb {
 			assert( states.size() == n_replicas );
 
 			EnergyType energy;
-			grb::Vector< EnergyType > tmp_calc_energy ( n );
+			grb::Vector< EnergyType, backend > tmp_calc_energy ( n );
 			const auto get_energy = [&couplings, &local_fields, &tmp_calc_energy, &ring](
 					EnergyType &energy, const grb::Vector< StateType > &state
 					){
@@ -439,23 +529,21 @@ namespace grb {
 				}
 			}
 
-			std::vector< grb::Vector< bool > > masks ;
-			for( size_t i = 0 ; i < n ; ++i ){
-				masks.push_back( grb::Vector< bool >(n) );
-				grb::setElement( masks[i], true, i );
-			}
-			grb::Vector< QType > h ( n );
-			grb::Vector< QType > log_rand ( n );
-			grb::Vector< StateType > delta ( n );
-			grb::Vector< EnergyType > dn ( n );
-			grb::Vector< bool > accept ( n );
+			std::vector< grb::Vector< bool, backend > > masks ;
+			rc = rc ? rc : matrix_partition< descr >( masks, couplings, seed );
+
+			grb::Vector< QType, backend > h ( n );
+			grb::Vector< QType, backend > log_rand ( n );
+			grb::Vector< StateType, backend > delta ( n );
+			grb::Vector< EnergyType, backend > dn ( n );
+			grb::Vector< bool, backend > accept ( n );
     		std::srand( static_cast<unsigned>( seed + s ) );
     		std::minstd_rand rng ( seed ); // minstd_rand or std::mt19937
 
 			auto sweep_data = std::tie(energy);
 
 			const auto ising_sweep = [&](
-				 grb::Vector< StateType > &state,
+				 grb::Vector< StateType, backend > &state,
 				 const TempType &beta,
 				 typeof(sweep_data) &data
 			  ){
@@ -473,11 +561,10 @@ namespace grb {
 				std::uniform_real_distribution< QType > rand ( 0.0, 1.0 );
 				for( size_t j = 0 ; j < n ; ++j ){
 					const auto rnd = rand( rng );
-					rc = rc ? rc : grb::setElement(log_rand,  log( rnd ), j );
+					rc = rc ? rc : grb::setElement(log_rand,  internal::log( rnd ), j );
 				}
 #ifndef NDEBUG
 				const grb::Vector< StateType > old_state = state;
-
 #endif
 				for(const auto &mask : masks ){
 					rc = rc ? rc : grb::clear( accept  );
@@ -572,7 +659,6 @@ namespace grb {
 		 * @tparam QType		The matrix values' type.
 		 * @tparam EnergyType	The energy type.
 		 * @tparam TempType		The inverse temperature type.
-		 * @tparam SweepDataType	Type of data to be passed on to the sweep function (e.g. a tuple of references to temporary vectors).
 		 *
 		 */
 		template<
@@ -600,14 +686,12 @@ namespace grb {
 				const int seed = 42,
 				const Ring &ring = Ring()
 				){
-			grb::Vector< QType > empty_local_fields ( grb::ncols( Q ) );
+			grb::Vector< QType > empty_local_fields ( 0 );
 
 			return simulated_annealing_RE_Ising< backend, descr, true >(
 					Q, empty_local_fields, states, energies, betas, best_state, best_energy, n_sweeps, use_pt, seed, ring
 					);
 		}
-
-	
 	} // namespace algorithms
 } // end namespace grb
 #undef ISCLOSE

From ded14fcfa2a6a12ee7914d90583dbe4753d74cee Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 14 Nov 2025 10:17:26 +0100
Subject: [PATCH 21/58] Many improvements in matrix_partition and
 simulated_annealing_RE_Ising

---
 .../algorithms/simulated_annealing_re.hpp     | 97 ++++++++++++-------
 1 file changed, 61 insertions(+), 36 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index e8817b2a2..1a4c374a9 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -350,10 +350,15 @@ namespace grb {
 		 * Adapted from Alg. 2 of `Graph Coloring on the GPU, M. Osama, M. Truong, C. Yang, A. Buluc, J.D. Owens`.
 		 *
 		 * @param[out] masks            The vector of constructed masks.
-		 * @param[in]     A             The square (symmetric) A matrix.
+		 * @param[in]     A             The square (symmetric) A matrix. It should not contain any explicit zeros!
+		 * @param[in,out] frontier      Temporary (dense) workspace vector; overwritten by the function.
+		 * @param[in,out] w             Temporary (dense) workspace vector; overwritten by the function.
+		 * @param[in]     seed          Seed for the random number generator.
 		 *
+		 * @tparam descr	grb::Descriptor for matrix operations. Should probably not be used
+		 * @tparam backend	The backend of GraphBLAS to be used
 		 * @tparam MaskType	The state variable type.
-		 * @tparam AType		The matrix values' type.
+		 * @tparam AType	The matrix values' type.
 		 *
 		 */
 		template<
@@ -366,6 +371,8 @@ namespace grb {
 		grb::RC matrix_partition(
 				std::vector< grb::Vector< MaskType, backend > > &masks,
 				const grb::Matrix< AType, backend, RSI, CSI, NZI > &A,
+				grb::Vector< AType, backend > &frontier,
+				grb::Vector< AType, backend > &w,
 				const int seed = 42
 				) {
 			masks.clear();
@@ -373,8 +380,8 @@ namespace grb {
 			const size_t n = grb::nrows( A );
 			assert( n == grb::ncols( A ) ); // A needs to be square
 
-			grb::Vector< AType, backend > frontier ( n );
-			grb::Vector< AType, backend > w ( n );
+			grb::resize( frontier, n );
+			grb::resize( w, n );
 
     		std::minstd_rand rng ( seed );
 			std::uniform_real_distribution< AType > rand ( 0.1, 2.0 );
@@ -389,46 +396,50 @@ namespace grb {
 			> maxTimesRing;
 			const grb::Monoid< grb::operators::add< AType >, grb::identities::zero > addMonoid;
 			const grb::operators::greater_than< AType > gtOp;
-			const grb::operators::logical_and< bool > orOp;
-			const grb::operators::not_equal< bool > xorOp;
-
-			grb::Vector< bool > remaining ( n );
-			grb::set( remaining, true );
+			const grb::operators::right_assign< AType > right_assign;
+ 
 			for( size_t i = 0; rc == grb::SUCCESS && i < n ; ++i ) {
 				// find max of neighbors
-				const auto w1 = w;
-				rc = rc ? rc : grb::clear( frontier );
-				rc = rc ? rc : grb::mxv< descr >( frontier, A, w1, maxTimesRing );
-				rc = rc ? rc : grb::foldl< descr >( frontier, w1, gtOp );
+				rc = rc ? rc : grb::set< descr >( frontier, static_cast< AType >( 0 ) );
+				rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( frontier, A, w, maxTimesRing );
+				rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( frontier, w, gtOp );
 
+				// is there any new node?
 				AType succ = static_cast< AType >( 0 );
 				rc = rc ? rc : grb::foldl< descr >( succ, frontier, addMonoid );
 				if( succ <= 0 ){
 					break;
 				}
-				if( masks.size() <= i ) {
-					masks.emplace_back( grb::Vector< bool >( n ) );
-				}else{
-					grb::clear( masks.at(i) );
-				}
+
+				// add new mask
+				masks.emplace_back( grb::Vector< bool >( n ) );
 				auto &new_mask = masks.at(i);
 				rc = rc ? rc : grb::resize( new_mask, n );
-				new_mask = remaining;
-				rc = rc ? rc : grb::foldl< descr >( new_mask, frontier, orOp);
+				rc = rc ? rc : grb::set< descr >( new_mask, frontier, static_cast< MaskType >(true) );
 
-				rc = rc ? rc : grb::foldl< descr >( remaining, new_mask, xorOp );
-				rc = rc ? rc : grb::set< descr >( w, remaining, w1 );
+				// do not consider the weights of used nodes
+				rc = rc ? rc : grb::foldl< descr >( w, new_mask,
+						static_cast< AType >( 0 ), right_assign );
 			}
 			assert( rc == grb::SUCCESS );
 
 #ifndef NDEBUG
+			if( rc != grb::SUCCESS) {
+				std::cerr << "Error in matrix_partition: " << rc << " " << grb::toString(rc) << std::endl;
+
+			}
+			size_t cnt = 0;
 			std::cerr << "Final masks: \n";
 			for(const auto&mask : masks ){
 				for( const auto &x : mask ){
-					if( x.second ) std::cerr << x.first << ", ";
+					if( x.second ){
+						std::cerr << x.first << ", ";
+						cnt ++;
+					}
 				}
 				std::cerr << std::endl;
 			}
+			assert( cnt == n );
 #endif
 			return rc;
 		}
@@ -437,7 +448,7 @@ namespace grb {
 		 * Estimate a solution to a given Ising problem. The solution is found
 		 * using the Simulated Annealing-Replica Exchange function above.
 		 *
-		 * The function minimized is $U(x) = x^T(Jx/2 + h)$, where $J$ is the supplied
+		 * The function minimized is $U(x) = x^T(\frac{1}{2}Jx + h)$, where $J$ is the supplied
 		 * couplings matrix and $h$ is the local_fields vector. The solution is searched
 		 * in the space of vectors $x$ with entries $0$ or $1$.
 		 *
@@ -445,7 +456,7 @@ namespace grb {
 		 *
 		 *  TODO: expand and complete documentation
 		 *
-		 * Warning: This function allocates O(n*n_replicas) memory for temporary vectors.
+		 * Warning: This function allocates $O(n)$ memory for temporary vectors.
 		 *
 		 * @param[in,out] states        On input: initial (dense) states.
 		 *                              On output: optimized (dense) states.
@@ -503,9 +514,14 @@ namespace grb {
 
 			EnergyType energy;
 			grb::Vector< EnergyType, backend > tmp_calc_energy ( n );
+
 			const auto get_energy = [&couplings, &local_fields, &tmp_calc_energy, &ring](
 					EnergyType &energy, const grb::Vector< StateType > &state
 					){
+				const size_t n = grb::size( local_fields );
+				assert( n == grb::size( state ) );
+				assert( n == grb::ncols( couplings ) );
+				assert( n == grb::nrows( couplings ) );
 				grb::RC rc = grb::SUCCESS;
 				grb::set( tmp_calc_energy, static_cast<EnergyType>( 0.0 ) );
 				rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( tmp_calc_energy, couplings, state, ring );
@@ -529,9 +545,6 @@ namespace grb {
 				}
 			}
 
-			std::vector< grb::Vector< bool, backend > > masks ;
-			rc = rc ? rc : matrix_partition< descr >( masks, couplings, seed );
-
 			grb::Vector< QType, backend > h ( n );
 			grb::Vector< QType, backend > log_rand ( n );
 			grb::Vector< StateType, backend > delta ( n );
@@ -540,6 +553,17 @@ namespace grb {
     		std::srand( static_cast<unsigned>( seed + s ) );
     		std::minstd_rand rng ( seed ); // minstd_rand or std::mt19937
 
+			grb::resize( h, n );
+			grb::resize( log_rand, n );
+			grb::resize( delta, n );
+			grb::resize( dn, n );
+			grb::resize( accept, n );
+
+			std::vector< grb::Vector< bool, backend > > masks ;
+			rc = rc ? rc : matrix_partition< descr >( masks, couplings, h, log_rand, seed );
+			grb::clear(h);
+			constexpr auto dense_descr = descr | grb::descriptors::dense;
+
 			auto sweep_data = std::tie(energy);
 
 			const auto ising_sweep = [&](
@@ -557,7 +581,7 @@ namespace grb {
 				}else {
 					rc = rc ? rc : grb::set< descr >( h, static_cast< QType >( 0.0 ) );
 				}
-				rc = rc ? rc : grb::mxv< descr >( h, couplings, state , ring );
+				rc = rc ? rc : grb::mxv< dense_descr >( h, couplings, state , ring );
 				std::uniform_real_distribution< QType > rand ( 0.0, 1.0 );
 				for( size_t j = 0 ; j < n ; ++j ){
 					const auto rnd = rand( rng );
@@ -566,6 +590,7 @@ namespace grb {
 #ifndef NDEBUG
 				const grb::Vector< StateType > old_state = state;
 #endif
+				rc = rc ? rc : grb::wait();
 				for(const auto &mask : masks ){
 					rc = rc ? rc : grb::clear( accept  );
 					rc = rc ? rc : grb::clear( delta  );
@@ -579,14 +604,14 @@ namespace grb {
 
 					// ( dn >= 0 ) | ( log_rand < beta * dn )
 					rc = rc ? rc : grb::set< descr >( accept, mask );
+					const auto lambda_fun = [ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
+						if( mask[i] ){
+							accept[i] = ( dn[i] >= 0 ) || ( log_rand[i] < beta * dn[i] );
+						}
+					};
 					rc = rc ? rc : grb::wait(); // needed to avoid ERROR: Segmentation Fault with nonblocking backend
 					rc = rc ? rc : grb::eWiseLambda< descr >(
-							[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
-								(void) i;
-								if( mask[i] ){
-									accept[i] = ( dn[i] >= 0 ) || ( log_rand[i] < beta * dn[i] );
-								}
-							}, mask, log_rand, dn, accept );
+							lambda_fun, mask, log_rand, dn, accept );
 
 					// new_state = np.where(accept, 1 - old, old)
 					rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( -1 ), ring.getMultiplicativeMonoid() );
@@ -639,7 +664,7 @@ namespace grb {
 		 * Estimate a solution to a given QUBO problem. The solution is found
 		 * using the Simulated Annealing-Replica Exchange function above.
 		 *
-		 * The function optimized is $U(x) = x^TQx$, with the constraint that $x$ is a
+		 * The function optimized is $U(x) = \frac{1}{2}x^TQx$, with the constraint that $x$ is a
 		 * 0/1 vector.
 		 *
 		 *  TODO: expand and complete documentation

From c5df161236f8d77d4677df79fcec234c5dea131a Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 14 Nov 2025 10:41:04 +0100
Subject: [PATCH 22/58] renamed simulated_annealing_re.cpp to
 simulated_annealing_re_ising.cpp

---
 ...imulated_annealing_re.cpp => simulated_annealing_re_ising.cpp} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/smoke/{simulated_annealing_re.cpp => simulated_annealing_re_ising.cpp} (100%)

diff --git a/tests/smoke/simulated_annealing_re.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
similarity index 100%
rename from tests/smoke/simulated_annealing_re.cpp
rename to tests/smoke/simulated_annealing_re_ising.cpp

From 89b4ac9704e155fa77886e89319d5c335babe22e Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 14 Nov 2025 11:07:56 +0100
Subject: [PATCH 23/58] Updating tests after changes in simulated_annealing

---
 tests/smoke/CMakeLists.txt                    |   3 +-
 .../smoke/simulated_annealing_re_from_mpi.cpp |  27 +-
 tests/smoke/simulated_annealing_re_ising.cpp  | 257 +++---------------
 3 files changed, 55 insertions(+), 232 deletions(-)

diff --git a/tests/smoke/CMakeLists.txt b/tests/smoke/CMakeLists.txt
index 95758a7e5..284799402 100644
--- a/tests/smoke/CMakeLists.txt
+++ b/tests/smoke/CMakeLists.txt
@@ -148,7 +148,8 @@ add_grb_executables( simulated_annealing_re_from_mpi simulated_annealing_re_from
 	BACKENDS bsp1d
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
 )
-add_grb_executables( simulated_annealing_re simulated_annealing_re.cpp
+
+add_grb_executables( simulated_annealing_re_ising simulated_annealing_re_ising.cpp
 	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
 )
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 02e1fbc96..a5eb377dc 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -283,9 +283,14 @@ EnergyType get_energy(
 				 const grb::Matrix< JType, backend >& couplings,
 				 const grb::Vector< JType, backend > &local_fields,
 				 const grb::Vector< IOType, backend > &state,
+				 grb::Vector< JType, backend > &tmp,
 				 const Ring &ring = Ring()
 			  ){
-	static grb::Vector< JType, backend > tmp ( grb::size( local_fields ) );
+	const size_t n = grb::size( local_fields );
+	assert( n == grb::size( state ) );
+	assert( n == grb::ncols( couplings ) );
+	assert( n == grb::nrows( couplings ) );
+	grb::resize( tmp, n );
 	grb::RC rc = grb::SUCCESS;
 	EnergyType energy = 0.0;
 
@@ -403,14 +408,14 @@ EnergyType sequential_sweep_immediate(
 
 #ifndef NDEBUG
 		if( rc != grb::SUCCESS ){
-			std::cerr << "\n\t Error in some GraphBLAS function " << rc << " : " << grb::toString( rc ) << std::endl;
+			std::cerr << "\n\t Error in some GraphBLAS function in sequential_sweep_immediate " << rc << " : " << grb::toString( rc ) << std::endl;
 			abort();
 		}
 		assert( rc == grb::SUCCESS );
 		if(s == 0){
 			const auto new_state = state;
 
-			const auto real_delta = get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state);
+			const auto real_delta = get_energy(couplings, local_fields, new_state, h) - get_energy(couplings, local_fields, old_state, h);
 			std::cerr << "\n\t Delta_energy: " << delta_energy;
 			std::cerr << "\n\t Real delta: " << real_delta;
 			std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy;
@@ -468,8 +473,8 @@ void ioProgram( const struct input &data_in, bool &success ) {
 		// Parse and store matrix in singleton class
 		// Map Storage tuple fields to meaningful names and wire up default data
 		auto &storage = Storage::getData();
-		// auto &n           = std::get<0>(storage); // n (rows/cols)
-		// auto &nnz         = std::get<1>(storage); // nz (nonzeros)
+		auto &n           = std::get<0>(storage); // n (rows/cols)
+		auto &nnz         = std::get<1>(storage); // nz (nonzeros)
 		auto &nsweeps_st  = std::get<2>(storage); // nsweeps
 		auto &n_replicas_st = std::get<3>(storage); // n_replicas
 		auto &use_pt      = std::get<4>(storage); // use_pt
@@ -479,11 +484,14 @@ void ioProgram( const struct input &data_in, bool &success ) {
 		auto &h           = std::get<8>(storage); // std::vector<JType>
 
 		// Initialize metadata from input (allow CLI to override defaults)
+		(void) n;
+		(void) nnz;
 		nsweeps_st    = data_in.nsweeps;
 		n_replicas_st = data_in.n_replicas;
 		use_pt        = data_in.use_pt;
 		seed_st       = data_in.seed;
-		// sweep_name    = data_in.sweep_name; // TODO: makes bsp1d backend crash!?
+		(void) sweep_name;
+		sweep_name    = data_in.sweep_name; // TODO: makes bsp1d backend crash!?
 
 
 		if ( data_in.use_default_data ) {
@@ -632,9 +640,10 @@ void grbProgram(
     // also make betas vector os size n_replicas and initialize with 10.0
     grb::Vector< JType, grb::reference > betas( n_replicas );
     grb::Vector< EnergyType, grb::reference > energies( n_replicas );
+    grb::Vector< EnergyType, grb::reference > tmp_energy( n );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
         rc = rc ? rc : grb::setElement( betas, static_cast< JType >(10.0), r );
-        rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r] ), r );
+        rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r], tmp_energy ), r );
     }
 
     #ifdef DEBUG_IMSB
@@ -730,9 +739,9 @@ void grbProgram(
 				std::cout << "Final state replica " << r << ":\n";
 				print_vector( states[r], 50 ,"states values" );  
 				std::cout << "With energy " << energies[ r ] << "\n";
-				std::cout << "With energy " << get_energy(  J, h, states[r] ) << "\n";
+				std::cout << "With energy " << get_energy(  J, h, states[r], tmp_energy ) << "\n";
 				std::cout << std::endl;
-				assert( ISCLOSE( get_energy( J, h, states[r] ), energies[ r ] ) );
+				assert( ISCLOSE( get_energy( J, h, states[r], tmp_energy ), energies[ r ] ) );
 			}
 		}
 
diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index 8360f3d6d..b2589ee62 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -70,7 +70,6 @@ typedef grb::utils::Singleton<
         size_t,                    // n_replicas
         bool,                      // use_pt
         unsigned,                  // seed
-        std::string,               // sweep_name
         std::vector<NonzeroT>,     // matrix data
         std::vector<JType>         // h vector
     >
@@ -140,7 +139,6 @@ struct input {
     size_t nsweeps = test_data::nsweeps;
     bool use_pt = test_data::use_pt;
     unsigned seed = test_data::seed;
-    std::string sweep_name = "sequential_sweep_immediate";
     bool verify = false;
     std::string filename_ref_solution;
 	bool direct;
@@ -268,18 +266,25 @@ void read_vector_data_from_array(
 }
 
 template<
-		grb::Descriptor descr = grb::descriptors::no_operation,
-		class Ring = Semiring<
-			grb::operators::add< JType >, grb::operators::mul< JType >,
-			grb::identities::zero, grb::identities::one
-		> >
+	Backend backend,
+	grb::Descriptor descr = grb::descriptors::no_operation,
+	class Ring = Semiring<
+		grb::operators::add< JType >, grb::operators::mul< JType >,
+		grb::identities::zero, grb::identities::one
+	>
+	>
 EnergyType get_energy(
-				 const grb::Matrix< JType >& couplings,
-				 const grb::Vector< JType > &local_fields,
-				 const grb::Vector< IOType > &state,
+				 const grb::Matrix< JType, backend >& couplings,
+				 const grb::Vector< JType, backend > &local_fields,
+				 const grb::Vector< IOType, backend > &state,
+				 grb::Vector< JType, backend > &tmp,
 				 const Ring &ring = Ring()
 			  ){
-	static grb::Vector< JType > tmp ( grb::size( local_fields ) );
+	const size_t n = grb::size( local_fields );
+	assert( n == grb::size( state ) );
+	assert( n == grb::ncols( couplings ) );
+	assert( n == grb::nrows( couplings ) );
+	grb::resize( tmp, n );
 	grb::RC rc = grb::SUCCESS;
 	EnergyType energy = 0.0;
 
@@ -293,158 +298,6 @@ EnergyType get_energy(
 	return energy;
 }
 
-template<
-		class Ring = Semiring<
-			grb::operators::add< JType >, grb::operators::mul< JType >,
-			grb::identities::zero, grb::identities::one
-		>,
-		grb::Descriptor descr = grb::descriptors::no_operation
-	>
-static EnergyType sequential_sweep_immediate(
-				 grb::Vector< IOType > &state,
-				 const JType &beta,
-				 std::tuple<
-				 	 const grb::Matrix< JType > &,
-				 	 const grb::Vector< JType > &,
-					 grb::Vector< JType >&,
-					 grb::Vector< JType >&,
-					 grb::Vector< IOType >&,
-					 const std::vector< grb::Vector< bool > >&,
-					 grb::Vector< EnergyType >&,
-					 grb::Vector< bool >&,
-					 std::minstd_rand&
-					 > &data
-			  ){
-		const Ring ring = Ring();
-
-
-		grb::RC rc = grb::SUCCESS;
-		const size_t n = grb::size( state );
-		EnergyType delta_energy = static_cast< EnergyType >(0.0);
-
-		const auto &couplings 	= std::get<0>(data);
-		const auto &local_fields = std::get<1>(data);
-		auto &h 		= std::get<2>(data);
-		auto &log_rand	= std::get<3>(data);
-		auto &delta		= std::get<4>(data);
-		const auto &masks = std::get<5>(data);
-		auto &dn		= std::get<6>(data);
-		auto &accept	= std::get<7>(data);
-		auto &rng       = std::get<8>(data);
-
-		rc = rc ? rc : grb::wait();
-		rc = rc ? rc : grb::resize( h, n );
-		rc = rc ? rc : grb::resize( log_rand, n );
-		rc = rc ? rc : grb::resize( delta, n );
-		rc = rc ? rc : grb::resize( dn, n );
-		rc = rc ? rc : grb::resize( accept, n );
-
-		rc = rc ? rc : grb::set< descr >( h, 0.0 );
-		rc = rc ? rc : grb::set< descr | grb::descriptors::dense >( h, local_fields );
-		rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( h, couplings, state , ring );
-
-		static std::uniform_real_distribution< JType > rand ( 0.0, 1.0 );
-		for( size_t j = 0 ; j < n ; ++j ){
-			const auto rnd = rand( rng );
-			rc = rc ? rc : grb::setElement(log_rand,  std::log( rnd ), j );
-		}
-
-#ifndef NDEBUG
-		const grb::Vector< IOType > old_state = state;
-#endif
-		for(const auto &mask : masks ){
-
-			rc = rc ? rc : grb::clear( accept  );
-			rc = rc ? rc : grb::clear( delta  );
-			rc = rc ? rc : grb::clear( dn );
-
-			// dn = (2*state_slice - 1) * h_slice
-			rc = rc ? rc : grb::set< descr >( dn, mask, state );
-			rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( 2 ), ring.getMultiplicativeMonoid()  );
-			rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
-			rc = rc ? rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() );
-
-			// ( dn >= 0 ) | ( log_rand < beta * dn )
-			rc = rc ? rc : grb::set< descr >( accept, mask );
-			rc = rc ? rc : grb::wait(); // needed to avoid ERROR: Segmentation Fault with nonblocking backend
-			rc = rc ? rc : grb::eWiseLambda< descr >(
-					[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
-						(void) i;
-						if( mask[i] ){
-							accept[i] = ( dn[i] >= 0 ) || ( log_rand[i] < beta * dn[i] );
-						}
-					}, mask, log_rand, dn, accept );
-
-			// new_state = np.where(accept, 1 - old, old)
-			rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeMonoid() );
-			rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< IOType >( 1 ), ring.getAdditiveMonoid() );
-			
-			// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
-			rc = rc ? rc : grb::clear( delta  );
-			rc = rc ? rc : grb::set< descr >( delta, accept, state );
-			rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< IOType >( 2 ), ring.getMultiplicativeMonoid() );
-			rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< IOType >( -1 ), ring.getAdditiveMonoid() );
-			
-			// Update delta_energy -= dot(dn, accept)
-			rc = rc ? rc : grb::dot< descr >( delta_energy, delta, h, ring );
-
-			// update h
-			rc = rc ? rc : grb::mxv< descr >( h, couplings, delta, ring );
-		}
-		rc = rc ? rc : grb::wait();
-
-#ifndef NDEBUG
-		if( rc != grb::SUCCESS ){
-			std::cerr << "\n\t Error in some GraphBLAS function " << rc << " : " << grb::toString( rc ) << std::endl;
-			abort();
-		}
-		assert( rc == grb::SUCCESS );
-		const auto new_state = state;
-		rc = rc ? rc : grb::wait();
-
-		const auto real_delta = get_energy(couplings, local_fields, new_state) - get_energy(couplings, local_fields, old_state);
-		std::cerr << "\n\t Delta_energy: " << delta_energy;
-		std::cerr << "\n\t Real delta: " << real_delta;
-		std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy;
-		std::cerr << std::endl;
-
-		assert( ISCLOSE(real_delta, delta_energy ) );
-#endif
-
-		return delta_energy;
-}
-
-
-template<
-		typename SweepDataType = std::tuple<
-				 	 const grb::Matrix< JType >&,
-				 	 const grb::Vector< JType >&,
-					 grb::Vector< JType >&,
-					 grb::Vector< JType >&,
-					 grb::Vector< IOType >&,
-					 const std::vector< grb::Vector< bool > >&,
-					 grb::Vector< EnergyType >&,
-					 grb::Vector< bool >&,
-					 std::minstd_rand&
-					 >,
-		typename SweepFuncType = std::function< EnergyType(
-					 grb::Vector< IOType >&,
-					 const JType&,
-					 SweepDataType&
-				 ) >,
-		class Ring = Semiring<
-			grb::operators::add< JType >, grb::operators::mul< JType >,
-			grb::identities::zero, grb::identities::one
-		>,
-		grb::Descriptor descr = grb::descriptors::no_operation
-	>
-SweepFuncType get_sweep_function( std::string sweep_name ){
-	if( sweep_name != "sequential_sweep_immediate" ){
-			std::cerr << "Warning: unknown sweep setting. Falling back to  \"sequential_sweep_immediate\"" << std::endl;
-	}
-	 return sequential_sweep_immediate< Ring, descr >;
-}
-
 void ioProgram( const struct input &data_in, bool &success ) {
 
     using namespace test_data;
@@ -457,22 +310,22 @@ void ioProgram( const struct input &data_in, bool &success ) {
 		// Parse and store matrix in singleton class
 		// Map Storage tuple fields to meaningful names and wire up default data
 		auto &storage = Storage::getData();
-		// auto &n           = std::get<0>(storage); // n (rows/cols)
-		// auto &nnz         = std::get<1>(storage); // nz (nonzeros)
+		auto &n           = std::get<0>(storage); // n (rows/cols)
+		auto &nnz         = std::get<1>(storage); // nz (nonzeros)
 		auto &nsweeps_st  = std::get<2>(storage); // nsweeps
 		auto &n_replicas_st = std::get<3>(storage); // n_replicas
 		auto &use_pt      = std::get<4>(storage); // use_pt
 		auto &seed_st     = std::get<5>(storage); // seed
-		auto &sweep_name  = std::get<6>(storage); // sweep_name
-		auto &Jdata       = std::get<7>(storage); // std::vector<NonzeroT>
-		auto &h           = std::get<8>(storage); // std::vector<JType>
+		auto &Jdata       = std::get<6>(storage); // std::vector<NonzeroT>
+		auto &h           = std::get<7>(storage); // std::vector<JType>
 
 		// Initialize metadata from input (allow CLI to override defaults)
+		(void) n;
+		(void) nnz;
 		nsweeps_st    = data_in.nsweeps;
 		n_replicas_st = data_in.n_replicas;
 		use_pt        = data_in.use_pt;
 		seed_st       = data_in.seed;
-		sweep_name    = data_in.sweep_name; // TODO: makes bsp1d backend crash!?
 
 
 		if ( data_in.use_default_data ) {
@@ -526,7 +379,7 @@ void grbProgram(
     // load into GraphBLAS
     grb::Matrix< JType > J( n, n );
 	{
-		const auto &data = std::get<7>(Storage::getData());
+		const auto &data = std::get<6>(Storage::getData());
 		RC io_rc = buildMatrixUnique(
 			J,
 			utils::makeNonzeroIterator<
@@ -566,7 +419,7 @@ void grbProgram(
 
     // build vector h with data from singleton
     {
-        const auto &h_data = std::get<8>(Storage::getData());
+        const auto &h_data = std::get<7>(Storage::getData());
 		rc = rc ? rc : buildVector(
 			h,
 			h_data.cbegin(),
@@ -575,20 +428,6 @@ void grbProgram(
 		);
     }
 
-	assert( grb::nnz( grb::Vector< bool >( n ) ) == 0 );
-
-	// build masks from row block indices
-    std::vector< grb::Vector< bool > > masks;
-	for(const auto&v : test_data::row_blocks ){
-		masks.emplace_back( grb::Vector< bool >( n ) );
-		for(const auto&i : v ){
-			grb::setElement( masks.back(), 1, i );
-		}
-		if( s == 0 ){
-			print_vector( masks.back(), 30, "MASK" );
-		}
-	}
-
     // seed RNGs (C and C++ engines) using requested seed (hardcoded default 8 if not provided)
     std::srand( static_cast<unsigned>( data_in.seed + s ) );
     std::minstd_rand rng ( data_in.seed + s ); // rng or std::mt19937
@@ -613,16 +452,14 @@ void grbProgram(
             SEQUENTIAL
         );
     }
-	
-	const auto sweep = get_sweep_function( data_in.sweep_name );
-
 
     #ifdef DEBUG_IMSB
+	grb::Vector< EnergyType > tmp_energy ( n );
     if( s == 0 ) {
         for ( size_t r = 0; r < n_replicas; ++r ) {
             std::cout << "Initial state replica " << r << ":\n";
             print_vector( states[r], 30 ,"states values" );  
-			std::cout << "With energy " << get_energy(  J, h, states[r] ) << "\n";
+			std::cout << "With energy " << get_energy(  J, h, states[r], tmp_energy ) << "\n";
             std::cout << std::endl;
         }
     }
@@ -634,37 +471,17 @@ void grbProgram(
     grb::Vector< EnergyType > energies( n_replicas );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
         rc = rc ? rc : grb::setElement( betas, static_cast< JType >(10.0), r );
-        rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r] ), r );
+        // rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r], tmp_energy ), r );
     }
-    rc = rc ? rc : wait();
-
-
-    grb::Vector<IOType> best_state ( n );
-	grb::Vector< JType > temp_h ( n );
-	grb::Vector< JType > temp_log_rand ( n );
-	grb::Vector< EnergyType > temp_dn ( n );
-	grb::Vector< bool > temp_accept ( n );
-	grb::Vector< IOType > temp_delta ( n );
-	auto sweep_data = std::tie(
-			(const typeof(J)&) J,
-			(const typeof(h)&) h,
- 			temp_h,
-			temp_log_rand,
-			temp_delta,
-			(const typeof(masks)&) masks,
-			temp_dn,
-			temp_accept,
-			rng
-			);
-	grb::wait();
 
+	grb::Vector< IOType > best_state ( n );
 
 	out.rep = data_in.rep;
 	// time a single call
 	if( out.rep == 0 ) {
 		timer.reset();
-		rc = grb::algorithms::simulated_annealing_RE(
-				sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
+		rc = grb::algorithms::simulated_annealing_RE_Ising(
+				 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
         );
 
 		rc = rc ? rc : wait();
@@ -705,8 +522,8 @@ void grbProgram(
 			if( rc == SUCCESS ) {
 				out.iterations = data_in.nsweeps;
 
-                rc = grb::algorithms::simulated_annealing_RE(
-				sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
+                rc = grb::algorithms::simulated_annealing_RE_Ising(
+				 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
                 );
 			}
 			if( grb::Properties<>::isNonblockingExecution ) {
@@ -719,9 +536,9 @@ void grbProgram(
 				std::cout << "Final state replica " << r << ":\n";
 				print_vector( states[r], 50 ,"states values" );  
 				std::cout << "With energy " << energies[ r ] << "\n";
-				std::cout << "With energy " << get_energy(  J, h, states[r] ) << "\n";
+				std::cout << "With energy " << get_energy(  J, h, states[r], tmp_energy ) << "\n";
 				std::cout << std::endl;
-				assert( ISCLOSE( get_energy( J, h, states[r] ), energies[ r ] ) );
+				assert( ISCLOSE( get_energy( J, h, states[r], tmp_energy ), energies[ r ] ) );
 			}
 		}
 
@@ -766,7 +583,6 @@ void printhelp( char *progname ) {
               << "  --nsweeps INT              Number of sweeps (default: 2)\n"
               << "  --use-pt BOOL              Use Parallel Tampering (default: 1)\n"
               << "  --seed INT                 RNG seed (default: 8)\n"
-              << "  --sweep STR                Sweep selector (default: sequential_sweep_immediate)\n"
               << "  --verify                   Verify output against reference solution\n"
               << "  --ref-solution-fname STR   Reference solution file (required with --verify unless using default data)\n"
               << "  --help, -h                 Print this help message\n";
@@ -805,9 +621,6 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
         } else if ( a == "--seed" ) {
             if ( i+1 >= argc ) { std::cerr << "--seed requires an argument\n"; return false; }
             in.seed = static_cast<unsigned>( std::stoul(argv[++i]) );
-        } else if ( a == "--sweep" ) {
-            if ( i+1 >= argc ) { std::cerr << "--sweep requires an argument\n"; return false; }
-            in.sweep_name = argv[++i];
         } else if ( a == "--verify" ) {
             in.verify = true;
         } else if ( a == "--ref-solution-fname" ) {
@@ -848,7 +661,7 @@ int main( int argc, char ** argv ) {
     }
 
 
-    std::cout << "seed=" << in.seed << " n_replicas=" << in.n_replicas << " nsweeps=" << in.nsweeps << " sweep=" << in.sweep_name << "\n";
+    std::cout << "seed=" << in.seed << " n_replicas=" << in.n_replicas << " nsweeps=" << in.nsweeps << " sweep=ising_sweep" << "\n";
 
     // Run IO program (populates Storage or similar)
     {

From 355bd2628e05795e94673e06c5c8feac4018c1fc Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 18 Nov 2025 11:14:32 +0100
Subject: [PATCH 24/58] Many changes throughout (e.g. string -> char[]) to run
 the bsp1d backend correctly

---
 .../algorithms/simulated_annealing_re.hpp     |  81 +++++++++----
 .../smoke/simulated_annealing_re_from_mpi.cpp | 112 +++++++++---------
 tests/smoke/simulated_annealing_re_ising.cpp  | 107 ++++++++---------
 3 files changed, 161 insertions(+), 139 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 1a4c374a9..c7f191d10 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -378,6 +378,7 @@ namespace grb {
 			masks.clear();
 			grb::RC rc = grb::SUCCESS;
 			const size_t n = grb::nrows( A );
+			const size_t s = spmd<>::pid();
 			assert( n == grb::ncols( A ) ); // A needs to be square
 
 			grb::resize( frontier, n );
@@ -395,8 +396,8 @@ namespace grb {
 			grb::identities::negative_infinity, grb::identities::zero
 			> maxTimesRing;
 			const grb::Monoid< grb::operators::add< AType >, grb::identities::zero > addMonoid;
-			const grb::operators::greater_than< AType > gtOp;
-			const grb::operators::right_assign< AType > right_assign;
+			const  grb::operators::greater_than< AType > gtOp;
+			const grb::Monoid< grb::operators::right_assign< AType >, grb::identities::zero > right_assign;
  
 			for( size_t i = 0; rc == grb::SUCCESS && i < n ; ++i ) {
 				// find max of neighbors
@@ -412,7 +413,7 @@ namespace grb {
 				}
 
 				// add new mask
-				masks.emplace_back( grb::Vector< bool >( n ) );
+				masks.emplace_back( grb::Vector< bool, backend >( n ) );
 				auto &new_mask = masks.at(i);
 				rc = rc ? rc : grb::resize( new_mask, n );
 				rc = rc ? rc : grb::set< descr >( new_mask, frontier, static_cast< MaskType >(true) );
@@ -429,17 +430,25 @@ namespace grb {
 
 			}
 			size_t cnt = 0;
-			std::cerr << "Final masks: \n";
+			if( s == 0 ) {
+				std::cerr << "Final masks: \n";
+			}
 			for(const auto&mask : masks ){
 				for( const auto &x : mask ){
 					if( x.second ){
-						std::cerr << x.first << ", ";
+						if( s == 0 ) {
+							std::cerr << x.first << ", ";
+						}
 						cnt ++;
 					}
 				}
-				std::cerr << std::endl;
+				if( s == 0 ) {
+					std::cerr << std::endl;
+				}
+			}
+			if( s == 0 ){
+				assert( cnt == n );
 			}
-			assert( cnt == n );
 #endif
 			return rc;
 		}
@@ -506,6 +515,7 @@ namespace grb {
 			const size_t n = grb::size( states[0] );
 			const size_t n_replicas = grb::size(betas);
 			const size_t s 		= spmd<>::pid();
+			(void) s;
 			grb::RC rc = grb::SUCCESS;
 
 			assert( grb::size(states[0]) == n );
@@ -550,7 +560,7 @@ namespace grb {
 			grb::Vector< StateType, backend > delta ( n );
 			grb::Vector< EnergyType, backend > dn ( n );
 			grb::Vector< bool, backend > accept ( n );
-    		std::srand( static_cast<unsigned>( seed + s ) );
+			std::srand( static_cast<unsigned>( seed ) );
     		std::minstd_rand rng ( seed ); // minstd_rand or std::mt19937
 
 			grb::resize( h, n );
@@ -564,14 +574,37 @@ namespace grb {
 			grb::clear(h);
 			constexpr auto dense_descr = descr | grb::descriptors::dense;
 
-			auto sweep_data = std::tie(energy);
+			auto sweep_data = std::tie(
+					(const typeof(couplings)&) couplings,
+					(const typeof(local_fields)&) local_fields,
+					(const typeof(masks)&) masks,
+					h,
+					log_rand,
+					delta,
+					dn,
+					accept,
+					rng,
+					(const typeof(ring)&) ring
+					);
 
-			const auto ising_sweep = [&](
+			const auto ising_sweep = [](
 				 grb::Vector< StateType, backend > &state,
 				 const TempType &beta,
 				 typeof(sweep_data) &data
 			  ){
-				(void) data;
+				const size_t s 		= spmd<>::pid();
+				std::cerr << "Process " << s << " inside sweep... " << std::endl;
+				const auto &couplings = std::get<0>(data);
+				const auto &local_fields = std::get<1>(data);
+				const auto &masks = std::get<2>(data);
+				auto &h = std::get<3>(data);
+				auto &log_rand = std::get<4>(data);
+				auto &delta = std::get<5>(data);
+				auto &dn = std::get<6>(data);
+				auto &accept = std::get<7>(data);
+				auto &rng = std::get<8>(data);
+				const auto &ring = std::get<9>(data);
+
 				const size_t n = grb::size( state );
 				EnergyType delta_energy = static_cast< EnergyType >(0.0);
 				grb::RC rc = grb::SUCCESS;
@@ -604,14 +637,14 @@ namespace grb {
 
 					// ( dn >= 0 ) | ( log_rand < beta * dn )
 					rc = rc ? rc : grb::set< descr >( accept, mask );
-					const auto lambda_fun = [ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
+					rc = rc ? rc : grb::wait(); // needed to avoid ERROR: Segmentation Fault with nonblocking backend
+					std::cerr << "\t calling eWiseLambda" << std::endl;
+					rc = rc ? rc : grb::eWiseLambda< descr >(
+							[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
 						if( mask[i] ){
 							accept[i] = ( dn[i] >= 0 ) || ( log_rand[i] < beta * dn[i] );
 						}
-					};
-					rc = rc ? rc : grb::wait(); // needed to avoid ERROR: Segmentation Fault with nonblocking backend
-					rc = rc ? rc : grb::eWiseLambda< descr >(
-							lambda_fun, mask, log_rand, dn, accept );
+					}, mask, log_rand, dn, accept );
 
 					// new_state = np.where(accept, 1 - old, old)
 					rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( -1 ), ring.getMultiplicativeMonoid() );
@@ -642,15 +675,17 @@ namespace grb {
 
 				EnergyType e1 = static_cast< EnergyType >( 0.0 ),
 						   e2 = static_cast< EnergyType >( 0.0 );
-				get_energy(e1, old_state);
-				get_energy(e2, new_state);
+				// get_energy(e1, old_state);
+				// get_energy(e2, new_state);
 				const auto real_delta = e2 - e1;
-				std::cerr << "\n\t Delta_energy: " << delta_energy;
-				std::cerr << "\n\t Real delta: " << real_delta;
-				std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy;
-				std::cerr << std::endl;
+				if( s == 0 ){
+					std::cerr << "\n\t Delta_energy: " << delta_energy;
+					std::cerr << "\n\t Real delta: " << real_delta;
+					std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy;
+					std::cerr << std::endl;
+				}
 
-				assert( ISCLOSE(real_delta, delta_energy ) );
+				// assert( ISCLOSE(real_delta, delta_energy ) );
 #endif
 				return delta_energy;
 			};
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index a5eb377dc..a90037eac 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -15,7 +15,6 @@
 #include <sstream>
 #include <vector>
 #include <tuple>
-#include <string>
 #include <memory>
 #include <algorithm>
 #include <random>
@@ -42,12 +41,16 @@ using namespace grb;
 #define DEBUG_IMSB 1
 #define ISCLOSE(a,b) (std::abs((b)-(a))/std::abs(a) < 1e-4) || (std::abs((b)-(a)) < 1e-4)
 
+constexpr size_t MAX_FN_SIZE = 255;
 
 // Types
 using IOType = double;   // scalar/vector element type
 using JType  = double;   // coupling (matrix) value type
 using EnergyType  = double;   // coupling (matrix) value type
 
+// Backend to be used inside each process
+constexpr grb::Backend internal_backend = grb::reference;
+
 /** Parser type */
 typedef grb::utils::MatrixFileReader<
 	JType,
@@ -74,7 +77,7 @@ typedef grb::utils::Singleton<
         size_t,                    // n_replicas
         bool,                      // use_pt
         unsigned,                  // seed
-        std::string,               // sweep_name
+        char[MAX_FN_SIZE],         // sweep_name
         std::vector<NonzeroT>,     // matrix data
         std::vector<JType>         // h vector
     >
@@ -128,25 +131,19 @@ namespace test_data {
 		0.46231686 , 0.87930208 ,  0.88663637, -0.25052299,
     };
 
-	const std::vector< std::vector< size_t > > row_blocks = {
-		// {3, 1, 6, 7, 9, 11, 12, 13, 14, 15}, {5, 2, 0, 8, 10}, {4} // for python data files
-		{0, 2, 4, 7, 9, 12, 13, 15}, {1, 3, 6, 8, 11}, {5, 10, 14},
-		// {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, {15}
-	};
-
 }
 // --- New, minimal runner configuration and result types ---
 struct input {
     bool use_default_data = false;
-    std::string filename_Jmatrix;
-    std::string filename_h;
     size_t n_replicas = test_data::n_replicas;
     size_t nsweeps = test_data::nsweeps;
     bool use_pt = test_data::use_pt;
     unsigned seed = test_data::seed;
-    std::string sweep_name = "sequential_sweep_immediate";
+    char sweep_name [ MAX_FN_SIZE ] = "sequential_sweep_immediate";
     bool verify = false;
-    std::string filename_ref_solution;
+    char filename_Jmatrix [ MAX_FN_SIZE ];
+    char filename_h [ MAX_FN_SIZE ];
+    char filename_ref_solution [ MAX_FN_SIZE ];
 	bool direct;
     size_t rep = 0;
     size_t outer = 1;
@@ -159,8 +156,8 @@ struct output {
     EnergyType best_energy = std::numeric_limits< EnergyType >::max();
 	size_t rep;
 	grb::utils::TimerResults times;
-    std::unique_ptr< PinnedVector< JType, grb::reference > > pinnedSolutionVector;
-    std::unique_ptr< PinnedVector< JType, grb::reference > > pinnedRefSolutionVector;
+    std::unique_ptr< PinnedVector< JType, internal_backend > > pinnedSolutionVector;
+    std::unique_ptr< PinnedVector< JType, internal_backend > > pinnedRefSolutionVector;
     // other things like eg: best replicas ...
 };
 
@@ -282,7 +279,7 @@ template<
 EnergyType get_energy(
 				 const grb::Matrix< JType, backend >& couplings,
 				 const grb::Vector< JType, backend > &local_fields,
-				 const grb::Vector< IOType, backend > &state,
+				 const grb::Vector< IOType,backend > &state,
 				 grb::Vector< JType, backend > &tmp,
 				 const Ring &ring = Ring()
 			  ){
@@ -313,7 +310,7 @@ template<
 		grb::Descriptor descr = grb::descriptors::no_operation
 	>
 EnergyType sequential_sweep_immediate(
-				 grb::Vector< IOType, backend > &state,
+				 grb::Vector< IOType, internal_backend > &state,
 				 const JType &beta,
 				 std::tuple<
 				 	 const grb::Matrix< JType, backend >&,
@@ -330,7 +327,6 @@ EnergyType sequential_sweep_immediate(
 		const size_t s = spmd<>::pid();
 		const Ring ring = Ring();
 
-
 		grb::RC rc = grb::SUCCESS;
 		const size_t n = grb::size( state );
 		EnergyType delta_energy = static_cast< EnergyType >(0.0);
@@ -352,8 +348,8 @@ EnergyType sequential_sweep_immediate(
 		rc = rc ? rc : grb::resize( dn, n );
 		rc = rc ? rc : grb::resize( accept, n );
 
-		rc = rc ? rc : grb::set< descr | grb::descriptors::dense >( h, local_fields );
-		rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( h, couplings, state , ring );
+		rc = rc ? rc : grb::set< descr >( h, local_fields );
+		rc = rc ? rc : grb::mxv< descr >( h, couplings, state , ring );
 
 		std::uniform_real_distribution< JType > rand ( 0.0, 1.0 );
 		for( size_t j = 0 ; j < n ; ++j ){
@@ -454,8 +450,8 @@ template<
 			grb::identities::zero, grb::identities::one
 		>
 	>
-SweepFuncType get_sweep_function( const std::string &sweep_name ){
-	if( sweep_name != "sequential_sweep_immediate" ){
+SweepFuncType get_sweep_function( const char sweep_name[] ){
+	if( std::strcmp(sweep_name, "sequential_sweep_immediate") != 0 ){
 			std::cerr << "Warning: unknown sweep setting. Falling back to  \"sequential_sweep_immediate\"" << std::endl;
 	}
 	 return sequential_sweep_immediate< Ring >;
@@ -490,8 +486,7 @@ void ioProgram( const struct input &data_in, bool &success ) {
 		n_replicas_st = data_in.n_replicas;
 		use_pt        = data_in.use_pt;
 		seed_st       = data_in.seed;
-		(void) sweep_name;
-		sweep_name    = data_in.sweep_name; // TODO: makes bsp1d backend crash!?
+        std::strncpy( sweep_name, data_in.sweep_name, MAX_FN_SIZE );
 
 
 		if ( data_in.use_default_data ) {
@@ -503,8 +498,8 @@ void ioProgram( const struct input &data_in, bool &success ) {
 			// read from files if provided
 			read_matrix_data<NonzeroT>( data_in.filename_Jmatrix, Jdata, data_in.direct );
 			read_vector_data<JType>( data_in.filename_h, h );
-			if(data_in.verify) {
-				if(data_in.filename_ref_solution.empty()) {
+			if( data_in.verify ) {
+				if( std::strlen(data_in.filename_ref_solution) == 0 ) {
 					std::cerr << "Reference solution file not provided for verification\n";
 					return;
 				}
@@ -538,13 +533,13 @@ void grbProgram(
 	if( s == 0 ){
 		std::cout << "problem size n = " << n << "\n";
 	}
-    grb::Vector< JType, grb::reference > h( n );
+    grb::Vector< JType, internal_backend > h( n );
 
     // populate J with test (random) values
     grb::RC rc = grb::SUCCESS;
 
     // load into GraphBLAS
-    grb::Matrix< JType, grb::reference > J( n, n );
+    grb::Matrix< JType, internal_backend > J( n, n );
 	{
 		const auto &data = std::get<7>(Storage::getData());
 		RC io_rc = buildMatrixUnique(
@@ -595,27 +590,15 @@ void grbProgram(
 		);
     }
 
-	// build masks from row block indices
-    std::vector< grb::Vector< bool, grb::reference > > masks;
-	for(const auto&v : test_data::row_blocks ){
-		masks.emplace_back( grb::Vector< bool, grb::reference >( n ) );
-		for(const auto&i : v ){
-			grb::setElement( masks.back(), 1, i );
-		}
-		if( s == 0 ){
-			print_vector( masks.back(), 30, "MASK" );
-		}
-	}
-
     // seed RNGs (C and C++ engines) using requested seed (hardcoded default 8 if not provided)
     std::srand( static_cast<unsigned>( data_in.seed + s ) );
     std::minstd_rand rng ( data_in.seed + s ); // rng or std::mt19937
 
     // create states storage and initialize with random 1/0 values
     const size_t n_replicas = data_in.n_replicas;
-    std::vector< grb::Vector< IOType, grb::reference > > states;
+    std::vector< grb::Vector< IOType, internal_backend > > states;
     for ( size_t r = 0; r < n_replicas; ++r ) {
-        states.emplace_back( grb::Vector< IOType, grb::reference >(n) );
+        states.emplace_back( grb::Vector< IOType, internal_backend >(n) );
         // initialize with random values
         std::uniform_int_distribution< unsigned short > randint(0,1);
         // we use buildvectorUnique with a random set of indices
@@ -638,9 +621,9 @@ void grbProgram(
 	const auto sweep = sequential_sweep_immediate< Ring >; // get_sweep_function( data_in.sweep_name );
 
     // also make betas vector os size n_replicas and initialize with 10.0
-    grb::Vector< JType, grb::reference > betas( n_replicas );
-    grb::Vector< EnergyType, grb::reference > energies( n_replicas );
-    grb::Vector< EnergyType, grb::reference > tmp_energy( n );
+    grb::Vector< JType, internal_backend > betas( n_replicas );
+    grb::Vector< EnergyType, internal_backend > energies( n_replicas );
+    grb::Vector< EnergyType, internal_backend > tmp_energy( n );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
         rc = rc ? rc : grb::setElement( betas, static_cast< JType >(10.0), r );
         rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r], tmp_energy ), r );
@@ -659,12 +642,21 @@ void grbProgram(
     #endif
     rc = rc ? rc : wait();
 
-    grb::Vector< IOType, grb::reference > best_state ( n );
-	grb::Vector< JType, grb::reference > temp_h ( n );
-	grb::Vector< JType, grb::reference > temp_log_rand ( n );
-	grb::Vector< EnergyType, grb::reference > temp_dn ( n );
-	grb::Vector< bool, grb::reference > temp_accept ( n );
-	grb::Vector< IOType, grb::reference > temp_delta ( n );
+	// we allocate temporary vectors
+	grb::Vector< JType, internal_backend > temp_h ( n );
+	grb::Vector< JType, internal_backend > temp_log_rand ( n );
+	grb::Vector< IOType, internal_backend > best_state ( n );
+	grb::Vector< EnergyType, internal_backend > temp_dn ( n );
+	grb::Vector< bool, internal_backend > temp_accept ( n );
+	grb::Vector< IOType, internal_backend > temp_delta ( n );
+
+	// build masks, we'll use two of the above temporary vectors
+    std::vector< grb::Vector< bool, internal_backend > > masks;
+	rc = rc ? rc : grb::algorithms::matrix_partition( masks, J, temp_h, temp_log_rand, test_data::seed );
+
+	if( s == 0 ){
+		print_vector( masks.back(), 30, "MASK" );
+	}
 	auto sweep_data = std::tie(
 			(const typeof(J)&) J,
 			(const typeof(h)&) h,
@@ -793,9 +785,9 @@ void printhelp( char *progname ) {
 }
 
 bool parse_arguments( input &in, int argc, char ** argv ) {
-    in.filename_Jmatrix.clear();
-    in.filename_h.clear();
-    in.filename_ref_solution.clear();
+	std::fill( in.filename_Jmatrix, in.filename_Jmatrix + MAX_FN_SIZE, '\0' );
+	std::fill( in.filename_h, in.filename_h + MAX_FN_SIZE, '\0' );
+	std::fill( in.filename_ref_solution, in.filename_ref_solution + MAX_FN_SIZE, '\0' );
     in.direct = true;
     // map benchmarking configuration to the runner's fields
     in.rep = grb::config::BENCHMARKING::inner();
@@ -809,10 +801,10 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
             in.use_default_data = true;
         } else if ( a == "--j-matrix-fname" ) {
             if ( i+1 >= argc ) { std::cerr << "--j-matrix-fname requires an argument\n"; return false; }
-            in.filename_Jmatrix = argv[++i];
+			std::strncpy( in.filename_Jmatrix, argv[++i], MAX_FN_SIZE );
         } else if ( a == "--h-fname" ) {
             if ( i+1 >= argc ) { std::cerr << "--h-fname requires an argument\n"; return false; }
-            in.filename_h = argv[++i];
+			std::strncpy( in.filename_h, argv[++i], MAX_FN_SIZE );
         } else if ( a == "--n-replicas" ) {
             if ( i+1 >= argc ) { std::cerr << "--n-replicas requires an argument\n"; return false; }
             in.n_replicas = static_cast<size_t>( std::stoul(argv[++i]) );
@@ -827,12 +819,12 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
             in.seed = static_cast<unsigned>( std::stoul(argv[++i]) );
         } else if ( a == "--sweep" ) {
             if ( i+1 >= argc ) { std::cerr << "--sweep requires an argument\n"; return false; }
-            in.sweep_name = argv[++i];
+			std::strncpy( in.sweep_name, argv[++i], MAX_FN_SIZE );
         } else if ( a == "--verify" ) {
             in.verify = true;
         } else if ( a == "--ref-solution-fname" ) {
             if ( i+1 >= argc ) { std::cerr << "--ref-solution-fname requires an argument\n"; return false; }
-            in.filename_ref_solution = argv[++i];
+			std::strncpy( in.filename_ref_solution, argv[++i], MAX_FN_SIZE );
         } else if ( a == "--help" || a == "-h" ) {
             printhelp( argv[0] );
             return false;
@@ -844,12 +836,14 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
 
     // basic validation
     if ( !in.use_default_data ) {
-        if ( in.filename_Jmatrix.empty() || in.filename_h.empty() ) {
+        if ( std::strlen( in.filename_Jmatrix ) == 0
+				|| std::strlen( in.filename_h ) == 0 ) {
             std::cerr << "Either --use-default-data or both --j-matrix-fname and --h-fname must be provided\n";
             return false;
         }
     }
-    if ( in.verify && !in.use_default_data && in.filename_ref_solution.empty() ) {
+    if ( in.verify && !in.use_default_data
+			&& std::strlen( in.filename_ref_solution ) == 0 ) {
         std::cerr << "--ref-solution-fname required when --verify is used without --use-default-data\n";
         return false;
     }
diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index b2589ee62..bd27e6168 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -38,6 +38,7 @@ using namespace grb;
 #define DEBUG_IMSB 1
 #define ISCLOSE(a,b) (std::abs((b)-(a))/std::abs(a) < 1e-4) || (std::abs((b)-(a)) < 1e-4)
 
+constexpr size_t MAX_FN_SIZE = 255;
 
 // Types
 using IOType = double;   // scalar/vector element type
@@ -82,7 +83,7 @@ namespace test_data {
     constexpr bool use_pt = true; 
     constexpr unsigned seed = 8;
 
-    const std::vector< std::pair< std::pair< grb::config::RowIndexType, grb::config::ColIndexType >, JType > > j_matrix_data = {
+    const std::vector<  std::pair< std::pair< grb::config::RowIndexType, grb::config::ColIndexType >, JType > > j_matrix_data = {
 		{{0, 1}, -0.2752300610319546},
 		{{1, 0}, -0.2752300610319546},
 		{{1, 2}, -0.10636508505639508},
@@ -113,34 +114,28 @@ namespace test_data {
 		{{15, 14}, 0.2955745584289766},
     };
 
-
     const size_t nnz = j_matrix_data.size();
 
-    const std::vector< JType > h_array_data = {
+    const auto h_array_data = {
         -0.08910436,  0.58034508,  0.97719304,  0.16792909,
 		-0.9221754 , -0.10715418, -0.62365497,  0.25411129,
 		-0.5693644 , -0.69805978,  0.07228861, -0.79922641,
 		0.46231686 , 0.87930208 ,  0.88663637, -0.25052299,
     };
-
-	const std::vector< std::vector< size_t > > row_blocks = {
-		// {3, 1, 6, 7, 9, 11, 12, 13, 14, 15}, {5, 2, 0, 8, 10}, {4} // for python data files
-		{0, 2, 4, 7, 9, 12, 13, 15}, {1, 3, 6, 8, 11}, {5, 10, 14},
-		// {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, {15}
-	};
-
 }
 // --- New, minimal runner configuration and result types ---
 struct input {
-    bool use_default_data = false;
-    std::string filename_Jmatrix;
-    std::string filename_h;
+    size_t n = test_data::n;
     size_t n_replicas = test_data::n_replicas;
     size_t nsweeps = test_data::nsweeps;
     bool use_pt = test_data::use_pt;
     unsigned seed = test_data::seed;
+    bool use_default_data = false;
+    char filename_Jmatrix [ MAX_FN_SIZE ];
+    char filename_h [ MAX_FN_SIZE ];
+    char sweep_name [ MAX_FN_SIZE ]= "sequential_sweep_immediate";
     bool verify = false;
-    std::string filename_ref_solution;
+    char filename_ref_solution [ MAX_FN_SIZE ];
 	bool direct;
     size_t rep = 0;
     size_t outer = 1;
@@ -210,7 +205,7 @@ void read_matrix_data_from_array(
                 NonzeroT( entry.first.first, entry.first.second, entry.second )
             );
 #ifdef DEBUG_IMSB
-			if( spmd<>::pid() < 2 ){
+			if( spmd<>::pid() < 1 ){
 				// print last data element from std::vector<NonzeroT> data
 				std::cout << "read_matrix_data_from_array: " << data.back().first.first << ", "
 					<< data.back().first.second << ", " << data.back().second << "\n";
@@ -299,35 +294,29 @@ EnergyType get_energy(
 }
 
 void ioProgram( const struct input &data_in, bool &success ) {
-
-    using namespace test_data;
 	success = false;
 
-	const size_t s = spmd<>::pid();
-	assert( s < spmd<>::nprocs() );
+	// Parse and store matrix in singleton class
+	// Map Storage tuple fields to meaningful names and wire up default data
+	auto &storage = Storage::getData();
+	auto &n           = std::get<0>(storage); // n (rows/cols)
+	auto &nnz         = std::get<1>(storage); // nz (nonzeros)
+	auto &nsweeps_st  = std::get<2>(storage); // nsweeps
+	auto &n_replicas_st = std::get<3>(storage); // n_replicas
+	auto &use_pt      = std::get<4>(storage); // use_pt
+	auto &seed_st     = std::get<5>(storage); // seed
+	auto &Jdata       = std::get<6>(storage); // std::vector<NonzeroT>
+	auto &h           = std::get<7>(storage); // std::vector<JType>
 
 	try {
-		// Parse and store matrix in singleton class
-		// Map Storage tuple fields to meaningful names and wire up default data
-		auto &storage = Storage::getData();
-		auto &n           = std::get<0>(storage); // n (rows/cols)
-		auto &nnz         = std::get<1>(storage); // nz (nonzeros)
-		auto &nsweeps_st  = std::get<2>(storage); // nsweeps
-		auto &n_replicas_st = std::get<3>(storage); // n_replicas
-		auto &use_pt      = std::get<4>(storage); // use_pt
-		auto &seed_st     = std::get<5>(storage); // seed
-		auto &Jdata       = std::get<6>(storage); // std::vector<NonzeroT>
-		auto &h           = std::get<7>(storage); // std::vector<JType>
-
 		// Initialize metadata from input (allow CLI to override defaults)
-		(void) n;
-		(void) nnz;
+		(void) n; // initialized by read_matrix_*
+		(void) nnz; // initialized by read_matrix_*
 		nsweeps_st    = data_in.nsweeps;
 		n_replicas_st = data_in.n_replicas;
 		use_pt        = data_in.use_pt;
 		seed_st       = data_in.seed;
 
-
 		if ( data_in.use_default_data ) {
 			// if no file provided, use default data from file_content
 			read_matrix_data_from_array<NonzeroT>( test_data::j_matrix_data, Jdata );
@@ -338,7 +327,7 @@ void ioProgram( const struct input &data_in, bool &success ) {
 			read_matrix_data<NonzeroT>( data_in.filename_Jmatrix, Jdata, data_in.direct );
 			read_vector_data<JType>( data_in.filename_h, h );
 			if(data_in.verify) {
-				if(data_in.filename_ref_solution.empty()) {
+				if( std::strlen(data_in.filename_ref_solution) == 0 ) {
 					std::cerr << "Reference solution file not provided for verification\n";
 					return;
 				}
@@ -368,6 +357,7 @@ void grbProgram(
 
     /* --- Problem setup --- */
     const size_t n = std::get<0>(Storage::getData());
+    const size_t n_replicas = std::get<3>(Storage::getData());
 	if( s == 0 ){
 		std::cout << "problem size n = " << n << "\n";
 	}
@@ -433,7 +423,6 @@ void grbProgram(
     std::minstd_rand rng ( data_in.seed + s ); // rng or std::mt19937
 
     // create states storage and initialize with random 1/0 values
-    const size_t n_replicas = std::get<3>(Storage::getData());
     std::vector< grb::Vector<IOType> > states;
     for ( size_t r = 0; r < n_replicas; ++r ) {
         states.emplace_back( grb::Vector<IOType>(n) );
@@ -455,14 +444,15 @@ void grbProgram(
 
     #ifdef DEBUG_IMSB
 	grb::Vector< EnergyType > tmp_energy ( n );
-    if( s == 0 ) {
-        for ( size_t r = 0; r < n_replicas; ++r ) {
-            std::cout << "Initial state replica " << r << ":\n";
-            print_vector( states[r], 30 ,"states values" );  
-			std::cout << "With energy " << get_energy(  J, h, states[r], tmp_energy ) << "\n";
-            std::cout << std::endl;
-        }
-    }
+	for ( size_t r = 0; r < n_replicas; ++r ) {
+		const auto en = get_energy(  J, h, states[r], tmp_energy );
+		if( s == 0 ) {
+			std::cout << "Initial state replica " << r << ":\n";
+			print_vector( states[r], 30 ,"states values" );
+			std::cout << "With energy " << en << "\n";
+			std::cout << std::endl;
+		}
+	}
     #endif
 
 
@@ -473,6 +463,7 @@ void grbProgram(
         rc = rc ? rc : grb::setElement( betas, static_cast< JType >(10.0), r );
         // rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r], tmp_energy ), r );
     }
+	assert( rc == grb::SUCCESS );
 
 	grb::Vector< IOType > best_state ( n );
 
@@ -531,15 +522,15 @@ void grbProgram(
 			}
 		}
 		const double time_taken = timer.time();
-		if( s == 0 ) {
-			for ( size_t r = 0; r < n_replicas; ++r ) {
+		for ( size_t r = 0; r < n_replicas; ++r ) {
+			const auto energy = energies[r];
+			if( s == 0 ) {
 				std::cout << "Final state replica " << r << ":\n";
 				print_vector( states[r], 50 ,"states values" );  
-				std::cout << "With energy " << energies[ r ] << "\n";
-				std::cout << "With energy " << get_energy(  J, h, states[r], tmp_energy ) << "\n";
+				std::cout << "With energy " << energy << "\n";
 				std::cout << std::endl;
-				assert( ISCLOSE( get_energy( J, h, states[r], tmp_energy ), energies[ r ] ) );
 			}
+			assert( ISCLOSE( get_energy( J, h, states[r], tmp_energy ), energies[ r ] ) );
 		}
 
 		out.times.useful = time_taken / static_cast< double >( out.rep );
@@ -589,9 +580,9 @@ void printhelp( char *progname ) {
 }
 
 bool parse_arguments( input &in, int argc, char ** argv ) {
-    in.filename_Jmatrix.clear();
-    in.filename_h.clear();
-    in.filename_ref_solution.clear();
+	std::fill( in.filename_Jmatrix, in.filename_Jmatrix + MAX_FN_SIZE, '\0' );
+	std::fill( in.filename_h, in.filename_h + MAX_FN_SIZE, '\0' );
+	std::fill( in.filename_ref_solution, in.filename_ref_solution + MAX_FN_SIZE, '\0' );
     in.direct = true;
     // map benchmarking configuration to the runner's fields
     in.rep = grb::config::BENCHMARKING::inner();
@@ -605,10 +596,10 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
             in.use_default_data = true;
         } else if ( a == "--j-matrix-fname" ) {
             if ( i+1 >= argc ) { std::cerr << "--j-matrix-fname requires an argument\n"; return false; }
-            in.filename_Jmatrix = argv[++i];
+            std::strncpy( in.filename_Jmatrix, argv[++i], MAX_FN_SIZE );
         } else if ( a == "--h-fname" ) {
             if ( i+1 >= argc ) { std::cerr << "--h-fname requires an argument\n"; return false; }
-            in.filename_h = argv[++i];
+			std::strncpy( in.filename_h, argv[++i], MAX_FN_SIZE );
         } else if ( a == "--n-replicas" ) {
             if ( i+1 >= argc ) { std::cerr << "--n-replicas requires an argument\n"; return false; }
             in.n_replicas = static_cast<size_t>( std::stoul(argv[++i]) );
@@ -625,7 +616,7 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
             in.verify = true;
         } else if ( a == "--ref-solution-fname" ) {
             if ( i+1 >= argc ) { std::cerr << "--ref-solution-fname requires an argument\n"; return false; }
-            in.filename_ref_solution = argv[++i];
+			std::strncpy( in.filename_ref_solution, argv[++i], MAX_FN_SIZE );
         } else if ( a == "--help" || a == "-h" ) {
             printhelp( argv[0] );
             return false;
@@ -637,12 +628,14 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
 
     // basic validation
     if ( !in.use_default_data ) {
-        if ( in.filename_Jmatrix.empty() || in.filename_h.empty() ) {
+        if ( std::strlen( in.filename_Jmatrix ) == 0
+				|| std::strlen( in.filename_h ) == 0 ) {
             std::cerr << "Either --use-default-data or both --j-matrix-fname and --h-fname must be provided\n";
             return false;
         }
     }
-    if ( in.verify && !in.use_default_data && in.filename_ref_solution.empty() ) {
+    if ( in.verify && !in.use_default_data
+			&& std::strlen( in.filename_ref_solution ) == 0 ) {
         std::cerr << "--ref-solution-fname required when --verify is used without --use-default-data\n";
         return false;
     }

From dc72b269f71d0f15f475e5799aaae4ae016ebf5c Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 18 Nov 2025 12:02:09 +0100
Subject: [PATCH 25/58] Added small test of simulated_annealing

---
 tests/smoke/simulated_annealing_re_ising.cpp | 7 +++++++
 tests/smoke/smoketests.sh                    | 8 ++++++++
 2 files changed, 15 insertions(+)

diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index bd27e6168..faf423876 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -682,5 +682,12 @@ int main( int argc, char ** argv ) {
     }
 
     std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.iterations << " best_energy=" << out.best_energy << "\n";
+	if( in.verify && in.use_default_data ){
+		if( out.best_energy <= -5 ){
+    		std::cout << "Test OK" << std::endl;
+		}else{
+    		std::cout << "Test FAILED" << std::endl;
+		}
+	}
     return out.error_code;
 }
diff --git a/tests/smoke/smoketests.sh b/tests/smoke/smoketests.sh
index ce6c3ce6b..b2d99d471 100755
--- a/tests/smoke/smoketests.sh
+++ b/tests/smoke/smoketests.sh
@@ -436,6 +436,14 @@ for BACKEND in ${BACKENDS[@]}; do
 				grep "Test OK" ${TEST_OUT_DIR}/fuselets.log || echo "Test FAILED"
 				echo " "
 			fi
+			# RE-SA smoke test on the built-in default data (--use-default-data --verify); passes iff the log contains "Test OK".
+			if [ "$BACKEND" = "reference_omp" ] || [ "$BACKEND" = "reference"  ] || [ "$BACKEND" = "hyperdags" ] || [ "$BACKEND" = "nonblocking"  ]; then
+				echo ">>>      [x]           [ ]       Tests Simulated Annealing-Replica Exchange on a"
+				echo "                                 small 16x16 matrix."
+				echo "Functional test executable: ${TEST_BIN_DIR}/simulated_annealing_re_ising_${BACKEND}"
+				$runner ${TEST_BIN_DIR}/simulated_annealing_re_ising_${BACKEND} --use-default-data --verify &> ${TEST_OUT_DIR}/simulated_annealing_re_ising_${BACKEND}_${P}_${T}.log
+				( grep "Test OK" ${TEST_OUT_DIR}/simulated_annealing_re_ising_${BACKEND}_${P}_${T}.log ) || printf 'Test FAILED.\n'
+			fi
 		done
 	done
 

From c7802efb29591127124ee4f7ab5dc143172d0a62 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 18 Nov 2025 13:22:06 +0100
Subject: [PATCH 26/58] Suppressing warnings in release + removed debug prints

---
 .../algorithms/simulated_annealing_re.hpp         |  8 ++++++--
 tests/smoke/simulated_annealing_re_from_mpi.cpp   | 15 +++++++++------
 tests/smoke/simulated_annealing_re_ising.cpp      |  8 ++++----
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index c7f191d10..2fd9d437e 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -281,6 +281,8 @@ namespace grb {
 			const size_t s = spmd<>::pid();
 			const size_t n_replicas = states.size();
 			const size_t n = grb::size(states[0]);
+			(void) n;
+			(void) s;
 
 			assert( n_replicas > 0 );
 			assert( n_replicas == grb::size( betas ) );
@@ -380,6 +382,7 @@ namespace grb {
 			const size_t n = grb::nrows( A );
 			const size_t s = spmd<>::pid();
 			assert( n == grb::ncols( A ) ); // A needs to be square
+			(void) s;
 
 			grb::resize( frontier, n );
 			grb::resize( w, n );
@@ -529,6 +532,7 @@ namespace grb {
 					EnergyType &energy, const grb::Vector< StateType > &state
 					){
 				const size_t n = grb::size( local_fields );
+				(void) n;
 				assert( n == grb::size( state ) );
 				assert( n == grb::ncols( couplings ) );
 				assert( n == grb::nrows( couplings ) );
@@ -593,7 +597,8 @@ namespace grb {
 				 typeof(sweep_data) &data
 			  ){
 				const size_t s 		= spmd<>::pid();
-				std::cerr << "Process " << s << " inside sweep... " << std::endl;
+				(void) s;
+
 				const auto &couplings = std::get<0>(data);
 				const auto &local_fields = std::get<1>(data);
 				const auto &masks = std::get<2>(data);
@@ -638,7 +643,6 @@ namespace grb {
 					// ( dn >= 0 ) | ( log_rand < beta * dn )
 					rc = rc ? rc : grb::set< descr >( accept, mask );
 					rc = rc ? rc : grb::wait(); // needed to avoid ERROR: Segmentation Fault with nonblocking backend
-					std::cerr << "\t calling eWiseLambda" << std::endl;
 					rc = rc ? rc : grb::eWiseLambda< descr >(
 							[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
 						if( mask[i] ){
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index a90037eac..ad0d6c3c7 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -77,7 +77,7 @@ typedef grb::utils::Singleton<
         size_t,                    // n_replicas
         bool,                      // use_pt
         unsigned,                  // seed
-        char[MAX_FN_SIZE],         // sweep_name
+        char[ MAX_FN_SIZE + 1 ],   // sweep_name
         std::vector<NonzeroT>,     // matrix data
         std::vector<JType>         // h vector
     >
@@ -139,11 +139,11 @@ struct input {
     size_t nsweeps = test_data::nsweeps;
     bool use_pt = test_data::use_pt;
     unsigned seed = test_data::seed;
-    char sweep_name [ MAX_FN_SIZE ] = "sequential_sweep_immediate";
+    char sweep_name [ MAX_FN_SIZE + 1 ] = "sequential_sweep_immediate";
     bool verify = false;
-    char filename_Jmatrix [ MAX_FN_SIZE ];
-    char filename_h [ MAX_FN_SIZE ];
-    char filename_ref_solution [ MAX_FN_SIZE ];
+    char filename_Jmatrix [ MAX_FN_SIZE + 1 ];
+    char filename_h [ MAX_FN_SIZE + 1 ];
+    char filename_ref_solution [ MAX_FN_SIZE + 1 ];
 	bool direct;
     size_t rep = 0;
     size_t outer = 1;
@@ -326,6 +326,7 @@ EnergyType sequential_sweep_immediate(
 			  ){
 		const size_t s = spmd<>::pid();
 		const Ring ring = Ring();
+		(void) s;
 
 		grb::RC rc = grb::SUCCESS;
 		const size_t n = grb::size( state );
@@ -463,6 +464,7 @@ void ioProgram( const struct input &data_in, bool &success ) {
 	success = false;
 
 	const size_t s = spmd<>::pid();
+	(void) s;
 	assert( s < spmd<>::nprocs() );
 
 	try {
@@ -486,7 +488,7 @@ void ioProgram( const struct input &data_in, bool &success ) {
 		n_replicas_st = data_in.n_replicas;
 		use_pt        = data_in.use_pt;
 		seed_st       = data_in.seed;
-        std::strncpy( sweep_name, data_in.sweep_name, MAX_FN_SIZE );
+        std::strncpy( sweep_name, data_in.sweep_name, MAX_FN_SIZE+1 );
 
 
 		if ( data_in.use_default_data ) {
@@ -530,6 +532,7 @@ void grbProgram(
 
     /* --- Problem setup --- */
     const size_t n = std::get<0>(Storage::getData());
+	(void) n;
 	if( s == 0 ){
 		std::cout << "problem size n = " << n << "\n";
 	}
diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index faf423876..c2436cd0a 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -131,11 +131,11 @@ struct input {
     bool use_pt = test_data::use_pt;
     unsigned seed = test_data::seed;
     bool use_default_data = false;
-    char filename_Jmatrix [ MAX_FN_SIZE ];
-    char filename_h [ MAX_FN_SIZE ];
-    char sweep_name [ MAX_FN_SIZE ]= "sequential_sweep_immediate";
+    char filename_Jmatrix [ MAX_FN_SIZE + 1 ];
+    char filename_h [ MAX_FN_SIZE + 1 ];
+    char sweep_name [ MAX_FN_SIZE + 1 ]= "sequential_sweep_immediate";
     bool verify = false;
-    char filename_ref_solution [ MAX_FN_SIZE ];
+    char filename_ref_solution [ MAX_FN_SIZE + 1 ];
 	bool direct;
     size_t rep = 0;
     size_t outer = 1;

From a492113bd9da0493888666c23eb030e504fe2984 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 18 Nov 2025 13:32:43 +0100
Subject: [PATCH 27/58] fixup! Added small test of simulated_annealing

---
 tests/smoke/smoketests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/smoke/smoketests.sh b/tests/smoke/smoketests.sh
index b2d99d471..3150b5bd9 100755
--- a/tests/smoke/smoketests.sh
+++ b/tests/smoke/smoketests.sh
@@ -440,7 +440,7 @@ for BACKEND in ${BACKENDS[@]}; do
 			if [ "$BACKEND" = "reference_omp" ] || [ "$BACKEND" = "reference"  ] || [ "$BACKEND" = "hyperdags" ] || [ "$BACKEND" = "nonblocking"  ]; then
 				echo ">>>      [x]           [ ]       Tests Simulated Annealing-Replica Exchange on a"
 				echo "                                 small 16x16 matrix."
-				echo "Functional test executable: ${TEST_BIN_DIR}/simulated_annealing_re_ising_reference"
+				echo "Functional test executable: ${TEST_BIN_DIR}/simulated_annealing_re_ising_${BACKEND}"
 				$runner ${TEST_BIN_DIR}/simulated_annealing_re_ising_${BACKEND} --use-default-data --verify &> ${TEST_OUT_DIR}/simulated_annealing_re_ising_${BACKEND}_${P}_${T}.log
 				( grep "Test OK" ${TEST_OUT_DIR}/simulated_annealing_re_ising_${BACKEND}_${P}_${T}.log ) || printf 'Test FAILED.\n'
 			fi

From 5619642251e2930ab886ce13705796d348fea6dd Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Thu, 20 Nov 2025 11:53:03 +0100
Subject: [PATCH 28/58] Fixes and additional checks for spmd variant

---
 .../algorithms/simulated_annealing_re.hpp     | 53 ++++++++++++-------
 .../smoke/simulated_annealing_re_from_mpi.cpp | 36 ++++++++-----
 2 files changed, 58 insertions(+), 31 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 2fd9d437e..81f4e3f8e 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -146,19 +146,30 @@ namespace grb {
 			const size_t s 		= spmd<>::pid();
 			const size_t nprocs = spmd<>::nprocs();
 			grb::RC rc = grb::SUCCESS;
+
+#ifndef NDEBUG
+			assert( grb::size(energies) == n_replicas );
+			assert( grb::size(betas) == n_replicas );
+#endif
 			struct data {
-					grb::Vector< StateType, backend > s;
 					EnergyType e;
 					TempType b;
 					int r;
 				};
+			grb::Vector< StateType, backend > s0 ( n );
+			grb::Vector< StateType, backend > s1 ( n );
+			grb::set( s0, static_cast< StateType >( 0 ) );
+			grb::set( s1, static_cast< StateType >( 0 ) );
+
+
 			struct data msg[ 2 ];
-			grb::resize( msg[0].s, n );
-			grb::resize( msg[1].s, n );
+			rc = rc ? rc : grb::resize( s0, n );
+			rc = rc ? rc : grb::resize( s1, n );
+			if( rc != grb::SUCCESS ) return rc;
 			int rand = std::rand();
 
 			for( size_t si = nprocs ; rc == grb::SUCCESS && si > 0; --si ){
-				if( si == s+1 ){
+				if( si-1 == s ){
 					for( size_t i = n_replicas - 1 ; i > 0 ; --i ){
 						const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
 
@@ -167,27 +178,33 @@ namespace grb {
 							std::swap( energies[i], energies[i-1] );
 						}
 					}
-					grb::set( msg[1].s, states[0] );
+					grb::set( s1, states[0] );
 					msg[ 1 ].e = energies[ 0 ];
 					msg[ 1 ].b = betas[0];
 					// msg[ 1 ].r = rand;
-				}else if( si == s+2 ){
-					grb::set( msg[0].s, states[ n_replicas - 1 ] );
+				}else if( si-2 == s ){
+					grb::set( s0, states[ n_replicas - 1 ] );
 					msg[ 0 ].e = energies[ n_replicas - 1 ];
 					msg[ 0 ].b = betas[ n_replicas - 1 ];
 					msg[ 0 ].r = rand;
 				}
 				if( si == 1 ) continue;
 
-				// std::cerr << "Calling broadcasts" << std::endl;
-				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].s, si-2 );
+#ifdef _GRB_WITH_LPF
+				rc = rc ? rc : grb::internal::broadcast( s0, si-2 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].e, si-2 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].b, si-2 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].r, si-2 );
-				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].s, si-1 );
+				rc = rc ? rc : grb::internal::broadcast( s1, si-1 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].e, si-1 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].b, si-1 );
 
+				assert( grb::nnz(s0) == n ); // state has to be dense!
+				assert( grb::nnz(s1) == n ); // state has to be dense!
+#else
+				assert( false ); // this should never run
+#endif
+
 #ifndef NDEBUG
 	
 				if( rc != grb::SUCCESS ){
@@ -200,18 +217,16 @@ namespace grb {
 				const EnergyType de = ( msg[ 1 ].e - msg[ 0 ].e ) * ( msg[ 1 ].b - msg[ 0 ].b );
 
 				if( rc == grb::SUCCESS && ( de >= 0 || msg[ 0 ].r < RAND_MAX * internal::exp( de ) ) ){
-					if( si == s+2 ){
-						states[ 0 ] = msg[ 0 ].s;
-						energies[ 0 ] = msg[ 0 ].e;
-						// betas[ 0 ] = msg[ 0 ].b;
-					}else if( si ==  s+1 ){
-						states[ n_replicas-1 ] = msg[ 1 ].s;
-						energies[ n_replicas-1 ] = msg[ 1 ].e;
-						// betas[ n_replicas-1 ] = msg[ 1 ].b;
+					if( si == s+1 ){
+
+						rc = rc ? rc : grb::set( states[ n_replicas - 1 ], s0 );
+						rc = rc ? rc : grb::setElement(energies, msg[ 0 ].e, n_replicas - 1 );
+					}else if( si ==  s+2 ){
+						rc = rc ? rc : grb::set( states[ 0 ], s1 );
+						rc = rc ? rc : grb::setElement(energies, msg[ 1 ].e, 0 );
 					}
 				}
 			}
-
 			return rc;
 		}
 
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index ad0d6c3c7..1c2073bed 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -290,12 +290,13 @@ EnergyType get_energy(
 	grb::resize( tmp, n );
 	grb::RC rc = grb::SUCCESS;
 	EnergyType energy = 0.0;
+	constexpr auto dense_descr = descr | grb::descriptors::dense;
 
 	rc = rc ? rc : grb::set< descr >( tmp, 0.0 );
-	rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( tmp, couplings, state, ring );
-	rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() );
-	rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( tmp, local_fields, ring.getAdditiveMonoid() );
-	rc = rc ? rc : grb::dot< descr | grb::descriptors::dense >( energy, tmp, state, ring );
+	rc = rc ? rc : grb::mxv< dense_descr >( tmp, couplings, state, ring );
+	rc = rc ? rc : grb::foldl< dense_descr >( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() );
+	rc = rc ? rc : grb::foldl< dense_descr >( tmp, local_fields, ring.getAdditiveMonoid() );
+	rc = rc ? rc : grb::dot< dense_descr >( energy, tmp, state, ring );
 	assert( rc == grb::SUCCESS );
 
 	return energy;
@@ -306,11 +307,11 @@ template<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one
 		>,
-		Backend backend = grb::reference,
+		Backend backend = internal_backend,
 		grb::Descriptor descr = grb::descriptors::no_operation
 	>
 EnergyType sequential_sweep_immediate(
-				 grb::Vector< IOType, internal_backend > &state,
+				 grb::Vector< IOType, backend > &state,
 				 const JType &beta,
 				 std::tuple<
 				 	 const grb::Matrix< JType, backend >&,
@@ -330,8 +331,9 @@ EnergyType sequential_sweep_immediate(
 
 		grb::RC rc = grb::SUCCESS;
 		const size_t n = grb::size( state );
-		EnergyType delta_energy = static_cast< EnergyType >(0.0);
+		assert( grb::nnz(state) == n ); // state has to be dense!
 
+		EnergyType delta_energy = static_cast< EnergyType >(0.0);
 		const auto &couplings 	= std::get<0>(data);
 		const auto &local_fields = std::get<1>(data);
 		auto &h 		= std::get<2>(data);
@@ -409,17 +411,17 @@ EnergyType sequential_sweep_immediate(
 			abort();
 		}
 		assert( rc == grb::SUCCESS );
-		if(s == 0){
-			const auto new_state = state;
+		const auto new_state = state;
 
-			const auto real_delta = get_energy(couplings, local_fields, new_state, h) - get_energy(couplings, local_fields, old_state, h);
+		const auto real_delta = get_energy(couplings, local_fields, new_state, h) - get_energy(couplings, local_fields, old_state, h);
+		if(s == 0){
 			std::cerr << "\n\t Delta_energy: " << delta_energy;
 			std::cerr << "\n\t Real delta: " << real_delta;
 			std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy;
 			std::cerr << std::endl;
 
-			assert( ISCLOSE(real_delta, delta_energy ) );
 		}
+		assert( ISCLOSE(real_delta, delta_energy ) );
 #endif
 
 		return delta_energy;
@@ -723,6 +725,7 @@ void grbProgram(
                 rc = grb::algorithms::simulated_annealing_RE(
 					sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
                 );
+				grb::collectives<>::allreduce( out.best_energy, grb::operators::min< EnergyType >() );
 			}
 			if( grb::Properties<>::isNonblockingExecution ) {
 				rc = rc ? rc : wait();
@@ -740,6 +743,7 @@ void grbProgram(
 			}
 		}
 
+
 		out.times.useful = time_taken / static_cast< double >( out.rep );
 		// print timing at root process
 		if( s == 0 ) {
@@ -897,6 +901,15 @@ int main( int argc, char ** argv ) {
             return 4;
         }
     }
+
+	int s;
+	if( MPI_Comm_rank(MPI_COMM_WORLD, &s) != MPI_SUCCESS ) {
+		std::cerr << "MPI_Comm_rank returns with non-SUCCESS exit code." << std::endl;
+		return 51;
+	}
+	if( s == 0 ){
+		std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.iterations << " best_energy=" << out.best_energy << "\n";
+	}
 	
 	// finalise MPI
 	if( MPI_Finalize() != MPI_SUCCESS ) {
@@ -904,6 +917,5 @@ int main( int argc, char ** argv ) {
 		return 50;
 	}
 
-    std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.iterations << " best_energy=" << out.best_energy << "\n";
     return out.error_code;
 }

From c5d3bc1b79e081d8429581049c3db49a676a5974 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Thu, 20 Nov 2025 11:53:31 +0100
Subject: [PATCH 29/58] Changed behavior: if no h provided assume zero

---
 tests/smoke/simulated_annealing_re_ising.cpp | 44 +++++++++++++-------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index c2436cd0a..5f84afe51 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -116,7 +116,7 @@ namespace test_data {
 
     const size_t nnz = j_matrix_data.size();
 
-    const auto h_array_data = {
+    const std::vector< JType > h_array_data = {
         -0.08910436,  0.58034508,  0.97719304,  0.16792909,
 		-0.9221754 , -0.10715418, -0.62365497,  0.25411129,
 		-0.5693644 , -0.69805978,  0.07228861, -0.79922641,
@@ -325,7 +325,12 @@ void ioProgram( const struct input &data_in, bool &success ) {
 		} else {
 			// read from files if provided
 			read_matrix_data<NonzeroT>( data_in.filename_Jmatrix, Jdata, data_in.direct );
-			read_vector_data<JType>( data_in.filename_h, h );
+			if( std::strlen( data_in.filename_h ) > 0 ) {
+				read_vector_data<JType>( data_in.filename_h, h );
+			}else{
+				h.resize( n );
+				std::fill( h.begin(), h.end(), static_cast< JType >( 0 ) );
+			}
 			if(data_in.verify) {
 				if( std::strlen(data_in.filename_ref_solution) == 0 ) {
 					std::cerr << "Reference solution file not provided for verification\n";
@@ -341,7 +346,6 @@ void ioProgram( const struct input &data_in, bool &success ) {
 	success = true;
 }
 
-
 void grbProgram(
     const struct input &data_in, 
     struct output &out
@@ -398,6 +402,13 @@ void grbProgram(
 			out.error_code = 5;
 			return;
 		}
+		// make J symmetric
+		// grb::Matrix< JType > Jt ( n, n );
+		// Jt = J;
+		// const grb::Monoid< grb::operators::add< JType >, grb::identities::zero > addMonoid;
+		// const grb::Monoid< grb::operators::mul< JType >, grb::identities::one > mulMonoid;
+		// grb::foldl< grb::descriptors::transpose_right >( J, Jt, addMonoid); // issue  #210
+		// grb::foldl<>( J, static_cast< JType >( 0.5 ), mulMonoid);
 
 #ifdef DEBUG_IMSB
 		if( s == 0 && grb::ncols( J ) < 40 ) {
@@ -442,18 +453,21 @@ void grbProgram(
         );
     }
 
-    #ifdef DEBUG_IMSB
 	grb::Vector< EnergyType > tmp_energy ( n );
+	EnergyType initial_energy = get_energy(  J, h, states[0], tmp_energy );
+
 	for ( size_t r = 0; r < n_replicas; ++r ) {
 		const auto en = get_energy(  J, h, states[r], tmp_energy );
+		initial_energy = std::min( en, initial_energy );
+    #ifdef DEBUG_IMSB
 		if( s == 0 ) {
 			std::cout << "Initial state replica " << r << ":\n";
 			print_vector( states[r], 30 ,"states values" );
 			std::cout << "With energy " << en << "\n";
 			std::cout << std::endl;
 		}
-	}
     #endif
+	}
 
 
     // also make betas vector os size n_replicas and initialize with 10.0
@@ -543,6 +557,14 @@ void grbProgram(
 			std::cout << "\tmilliseconds per iteration: "
 				<< ( out.times.useful / static_cast< double >( out.iterations ) )
 				<< "\n";
+
+			if( data_in.verify ){
+				if( out.best_energy < initial_energy ){
+					std::cout << "Test OK" << std::endl;
+				}else{
+					std::cout << "Test FAILED" << std::endl;
+				}
+			}
 		}
 		sleep( 1 );
 	}
@@ -569,7 +591,7 @@ void printhelp( char *progname ) {
               << "Options:\n"
               << "  --use-default-data         Use embedded default test data\n"
               << "  --j-matrix-fname STR       Path to J matrix file (matrix-market or supported)\n"
-              << "  --h-fname STR              Path to h (local fields) vector (whitespace separated)\n"
+              << "  --h-fname STR              Path to h (local fields) vector (whitespace separated), if not provided assume zero\n"
               << "  --n-replicas INT           Number of replicas (default: 3)\n"
               << "  --nsweeps INT              Number of sweeps (default: 2)\n"
               << "  --use-pt BOOL              Use Parallel Tampering (default: 1)\n"
@@ -628,8 +650,7 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
 
     // basic validation
     if ( !in.use_default_data ) {
-        if ( std::strlen( in.filename_Jmatrix ) == 0
-				|| std::strlen( in.filename_h ) == 0 ) {
+        if ( std::strlen( in.filename_Jmatrix ) == 0 ) {
             std::cerr << "Either --use-default-data or both --j-matrix-fname and --h-fname must be provided\n";
             return false;
         }
@@ -682,12 +703,5 @@ int main( int argc, char ** argv ) {
     }
 
     std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.iterations << " best_energy=" << out.best_energy << "\n";
-	if( in.verify && in.use_default_data ){
-		if( out.best_energy <= -5 ){
-    		std::cout << "Test OK" << std::endl;
-		}else{
-    		std::cout << "Test FAILED" << std::endl;
-		}
-	}
     return out.error_code;
 }

From e627fff6508e900ae7bbc4204768a3d1c0008330 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Thu, 20 Nov 2025 12:48:14 +0100
Subject: [PATCH 30/58] fixup! Changed behavior: if no h provided assume zero

---
 tests/smoke/simulated_annealing_re_from_mpi.cpp | 14 +++++++++-----
 tests/smoke/simulated_annealing_re_ising.cpp    |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 1c2073bed..42bdafeb8 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -501,7 +501,12 @@ void ioProgram( const struct input &data_in, bool &success ) {
 		} else {
 			// read from files if provided
 			read_matrix_data<NonzeroT>( data_in.filename_Jmatrix, Jdata, data_in.direct );
-			read_vector_data<JType>( data_in.filename_h, h );
+			if( std::strlen( data_in.filename_h ) > 0 ) {
+				read_vector_data<JType>( data_in.filename_h, h );
+			}else{
+				h.resize( n );
+				std::fill( h.begin(), h.end(), static_cast< JType >( 0 ) );
+			}
 			if( data_in.verify ) {
 				if( std::strlen(data_in.filename_ref_solution) == 0 ) {
 					std::cerr << "Reference solution file not provided for verification\n";
@@ -780,7 +785,7 @@ void printhelp( char *progname ) {
               << "Options:\n"
               << "  --use-default-data         Use embedded default test data\n"
               << "  --j-matrix-fname STR       Path to J matrix file (matrix-market or supported)\n"
-              << "  --h-fname STR              Path to h (local fields) vector (whitespace separated)\n"
+              << "  --h-fname STR              Path to h (local fields) vector (whitespace separated), if not provided assume zero\n"
               << "  --n-replicas INT           Number of replicas (default: 3)\n"
               << "  --nsweeps INT              Number of sweeps (default: 2)\n"
               << "  --use-pt BOOL              Use Parallel Tampering (default: 1)\n"
@@ -843,9 +848,8 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
 
     // basic validation
     if ( !in.use_default_data ) {
-        if ( std::strlen( in.filename_Jmatrix ) == 0
-				|| std::strlen( in.filename_h ) == 0 ) {
-            std::cerr << "Either --use-default-data or both --j-matrix-fname and --h-fname must be provided\n";
+        if ( std::strlen( in.filename_Jmatrix ) == 0 ) {
+            std::cerr << "Either --use-default-data or --j-matrix-fname must be provided\n";
             return false;
         }
     }
diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index 5f84afe51..ec65d2c20 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -651,7 +651,7 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
     // basic validation
     if ( !in.use_default_data ) {
         if ( std::strlen( in.filename_Jmatrix ) == 0 ) {
-            std::cerr << "Either --use-default-data or both --j-matrix-fname and --h-fname must be provided\n";
+            std::cerr << "Either --use-default-data or both --j-matrix-fname must be provided\n";
             return false;
         }
     }

From 700f47e67d8320c43d1c546d7d7c2533019ae3f2 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 25 Nov 2025 12:18:50 +0100
Subject: [PATCH 31/58] Some slight performance improvements

---
 .../algorithms/simulated_annealing_re.hpp     | 44 +++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 81f4e3f8e..1b43fe8dd 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -575,7 +575,7 @@ namespace grb {
 			}
 
 			grb::Vector< QType, backend > h ( n );
-			grb::Vector< QType, backend > log_rand ( n );
+			grb::Vector< QType, backend > rand ( n );
 			grb::Vector< StateType, backend > delta ( n );
 			grb::Vector< EnergyType, backend > dn ( n );
 			grb::Vector< bool, backend > accept ( n );
@@ -583,13 +583,13 @@ namespace grb {
     		std::minstd_rand rng ( seed ); // minstd_rand or std::mt19937
 
 			grb::resize( h, n );
-			grb::resize( log_rand, n );
+			grb::resize( rand, n );
 			grb::resize( delta, n );
 			grb::resize( dn, n );
 			grb::resize( accept, n );
 
 			std::vector< grb::Vector< bool, backend > > masks ;
-			rc = rc ? rc : matrix_partition< descr >( masks, couplings, h, log_rand, seed );
+			rc = rc ? rc : matrix_partition< descr >( masks, couplings, h, rand, seed );
 			grb::clear(h);
 			constexpr auto dense_descr = descr | grb::descriptors::dense;
 
@@ -598,7 +598,7 @@ namespace grb {
 					(const typeof(local_fields)&) local_fields,
 					(const typeof(masks)&) masks,
 					h,
-					log_rand,
+					rand,
 					delta,
 					dn,
 					accept,
@@ -606,7 +606,11 @@ namespace grb {
 					(const typeof(ring)&) ring
 					);
 
-			const auto ising_sweep = [](
+#ifdef NDEBUG
+            const auto ising_sweep = [](
+#else
+            const auto ising_sweep = [&get_energy](
+#endif
 				 grb::Vector< StateType, backend > &state,
 				 const TempType &beta,
 				 typeof(sweep_data) &data
@@ -618,7 +622,7 @@ namespace grb {
 				const auto &local_fields = std::get<1>(data);
 				const auto &masks = std::get<2>(data);
 				auto &h = std::get<3>(data);
-				auto &log_rand = std::get<4>(data);
+				auto &rand = std::get<4>(data);
 				auto &delta = std::get<5>(data);
 				auto &dn = std::get<6>(data);
 				auto &accept = std::get<7>(data);
@@ -628,6 +632,7 @@ namespace grb {
 				const size_t n = grb::size( state );
 				EnergyType delta_energy = static_cast< EnergyType >(0.0);
 				grb::RC rc = grb::SUCCESS;
+				(void) n;
 
 				if( !empty_local_fields) {
 					rc = rc ? rc : grb::set< descr >( h, local_fields );
@@ -635,42 +640,37 @@ namespace grb {
 					rc = rc ? rc : grb::set< descr >( h, static_cast< QType >( 0.0 ) );
 				}
 				rc = rc ? rc : grb::mxv< dense_descr >( h, couplings, state , ring );
-				std::uniform_real_distribution< QType > rand ( 0.0, 1.0 );
-				for( size_t j = 0 ; j < n ; ++j ){
-					const auto rnd = rand( rng );
-					rc = rc ? rc : grb::setElement(log_rand,  internal::log( rnd ), j );
+				std::uniform_real_distribution< QType > rand_gen ( 0.0, 1.0 );
+				for( size_t i = 0 ; i < n; ++i ){
+					grb::setElement( rand, rand_gen( rng ), i );
 				}
+
 #ifndef NDEBUG
 				const grb::Vector< StateType > old_state = state;
 #endif
 				rc = rc ? rc : grb::wait();
 				for(const auto &mask : masks ){
-					rc = rc ? rc : grb::clear( accept  );
-					rc = rc ? rc : grb::clear( delta  );
-					rc = rc ? rc : grb::clear( dn );
-
 					// dn = (2*state_slice - 1) * h_slice
 					rc = rc ? rc : grb::set< descr >( dn, mask, state );
 					rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( 2 ), ring.getMultiplicativeMonoid()  );
 					rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
 					rc = rc ? rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() );
 
-					// ( dn >= 0 ) | ( log_rand < beta * dn )
+					// ( dn >= 0 ) | ( rand < beta * dn )
 					rc = rc ? rc : grb::set< descr >( accept, mask );
 					rc = rc ? rc : grb::wait(); // needed to avoid ERROR: Segmentation Fault with nonblocking backend
 					rc = rc ? rc : grb::eWiseLambda< descr >(
-							[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
+							[ &mask, &accept, &dn, &rand, beta ]( const size_t i ){
 						if( mask[i] ){
-							accept[i] = ( dn[i] >= 0 ) || ( log_rand[i] < beta * dn[i] );
+							accept[i] = ( dn[i] >= 0 ) || ( internal::log( rand[i] ) < beta * dn[i] );
 						}
-					}, mask, log_rand, dn, accept );
+					}, mask, rand, dn, accept );
 
 					// new_state = np.where(accept, 1 - old, old)
 					rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( -1 ), ring.getMultiplicativeMonoid() );
 					rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( 1 ), ring.getAdditiveMonoid() );
 					
 					// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
-					rc = rc ? rc : grb::clear( delta  );
 					rc = rc ? rc : grb::set< descr >( delta, accept, state );
 					rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< StateType >( 2 ), ring.getMultiplicativeMonoid() );
 					rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< StateType >( -1 ), ring.getAdditiveMonoid() );
@@ -694,8 +694,8 @@ namespace grb {
 
 				EnergyType e1 = static_cast< EnergyType >( 0.0 ),
 						   e2 = static_cast< EnergyType >( 0.0 );
-				// get_energy(e1, old_state);
-				// get_energy(e2, new_state);
+				get_energy(e1, old_state);
+				get_energy(e2, new_state);
 				const auto real_delta = e2 - e1;
 				if( s == 0 ){
 					std::cerr << "\n\t Delta_energy: " << delta_energy;
@@ -704,7 +704,7 @@ namespace grb {
 					std::cerr << std::endl;
 				}
 
-				// assert( ISCLOSE(real_delta, delta_energy ) );
+				assert( ISCLOSE(real_delta, delta_energy ) );
 #endif
 				return delta_energy;
 			};

From cee1bb21d89372c70cd1f12b1e1eac0b421e92c8 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Wed, 26 Nov 2025 11:48:42 +0100
Subject: [PATCH 32/58] Replaced eWiseLambda with fold in Simulated Annealing

---
 .../algorithms/simulated_annealing_re.hpp     | 20 +++++++-------
 .../smoke/simulated_annealing_re_from_mpi.cpp | 27 ++++++++-----------
 2 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 1b43fe8dd..394bde089 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -640,11 +640,14 @@ namespace grb {
 					rc = rc ? rc : grb::set< descr >( h, static_cast< QType >( 0.0 ) );
 				}
 				rc = rc ? rc : grb::mxv< dense_descr >( h, couplings, state , ring );
-				std::uniform_real_distribution< QType > rand_gen ( 0.0, 1.0 );
+
+				std::exponential_distribution< EnergyType > rand_gen ( beta );
 				for( size_t i = 0 ; i < n; ++i ){
-					grb::setElement( rand, rand_gen( rng ), i );
+					const auto rnd = -rand_gen( rng );
+					grb::setElement( rand, rnd, i );
 				}
 
+				const grb::operators::leq< EnergyType > leq_operator;
 #ifndef NDEBUG
 				const grb::Vector< StateType > old_state = state;
 #endif
@@ -656,15 +659,10 @@ namespace grb {
 					rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
 					rc = rc ? rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() );
 
-					// ( dn >= 0 ) | ( rand < beta * dn )
-					rc = rc ? rc : grb::set< descr >( accept, mask );
-					rc = rc ? rc : grb::wait(); // needed to avoid ERROR: Segmentation Fault with nonblocking backend
-					rc = rc ? rc : grb::eWiseLambda< descr >(
-							[ &mask, &accept, &dn, &rand, beta ]( const size_t i ){
-						if( mask[i] ){
-							accept[i] = ( dn[i] >= 0 ) || ( internal::log( rand[i] ) < beta * dn[i] );
-						}
-					}, mask, rand, dn, accept );
+					// Choose which changes to accept
+					// ( dn >= 0 ) | ( rand/beta < dn )
+					rc = rc ? rc : grb::foldl< descr >( dn, rand, leq_operator );
+					rc = rc ? rc : grb::set< descr >( accept, dn, mask );
 
 					// new_state = np.where(accept, 1 - old, old)
 					rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( -1 ), ring.getMultiplicativeMonoid() );
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 42bdafeb8..fd64c2769 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -337,7 +337,7 @@ EnergyType sequential_sweep_immediate(
 		const auto &couplings 	= std::get<0>(data);
 		const auto &local_fields = std::get<1>(data);
 		auto &h 		= std::get<2>(data);
-		auto &log_rand	= std::get<3>(data);
+		auto &rand	= std::get<3>(data);
 		auto &delta		= std::get<4>(data);
 		const auto &masks = std::get<5>(data);
 		auto &dn		= std::get<6>(data);
@@ -346,7 +346,7 @@ EnergyType sequential_sweep_immediate(
 
 		rc = rc ? rc : grb::wait();
 		rc = rc ? rc : grb::resize( h, n );
-		rc = rc ? rc : grb::resize( log_rand, n );
+		rc = rc ? rc : grb::resize( rand, n );
 		rc = rc ? rc : grb::resize( delta, n );
 		rc = rc ? rc : grb::resize( dn, n );
 		rc = rc ? rc : grb::resize( accept, n );
@@ -354,12 +354,13 @@ EnergyType sequential_sweep_immediate(
 		rc = rc ? rc : grb::set< descr >( h, local_fields );
 		rc = rc ? rc : grb::mxv< descr >( h, couplings, state , ring );
 
-		std::uniform_real_distribution< JType > rand ( 0.0, 1.0 );
-		for( size_t j = 0 ; j < n ; ++j ){
-			const auto rnd = rand( rng );
-			rc = rc ? rc : grb::setElement(log_rand,  std::log( rnd ), j );
+		std::exponential_distribution< EnergyType > rand_gen ( beta );
+		for( size_t i = 0 ; i < n; ++i ){
+			const auto rnd = -rand_gen( rng );
+			grb::setElement( rand, rnd, i );
 		}
 
+		const grb::operators::leq< EnergyType > leq_operator;
 #ifndef NDEBUG
 		const grb::Vector< IOType, backend > old_state = state;
 #endif
@@ -375,16 +376,10 @@ EnergyType sequential_sweep_immediate(
 			rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
 			rc = rc ? rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() );
 
-			// ( dn >= 0 ) | ( log_rand < beta * dn )
-			rc = rc ? rc : grb::set< descr >( accept, mask );
-			rc = rc ? rc : grb::wait(); // needed to avoid ERROR: Segmentation Fault with nonblocking backend
-			rc = rc ? rc : grb::eWiseLambda< descr >(
-					[ &mask, &accept, &dn, &log_rand, beta ]( const size_t i ){
-						(void) i;
-						if( mask[i] ){
-							accept[i] = ( dn[i] >= 0 ) || ( log_rand[i] < beta * dn[i] );
-						}
-					}, mask, log_rand, dn, accept );
+			// Choose which changes to accept
+			// ( dn >= 0 ) | ( rand/beta < dn )
+			rc = rc ? rc : grb::foldl< descr >( dn, rand, leq_operator );
+			rc = rc ? rc : grb::set< descr >( accept, dn, mask );
 
 			// new_state = np.where(accept, 1 - old, old)
 			rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeMonoid() );

From e4c47745cbd2e2d4ca0c5cdcf01ef7d72ecf0477 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Thu, 27 Nov 2025 09:46:32 +0100
Subject: [PATCH 33/58] Reduced number of folds in simulated annealing sweep

---
 .../algorithms/simulated_annealing_re.hpp       | 11 +++++------
 tests/smoke/simulated_annealing_re_from_mpi.cpp | 17 +++++------------
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 394bde089..448258a63 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -648,6 +648,8 @@ namespace grb {
 				}
 
 				const grb::operators::leq< EnergyType > leq_operator;
+				const grb::operators::right_assign< EnergyType > right_assign_op;
+				const grb::operators::not_equal< EnergyType > neq_operator;
 #ifndef NDEBUG
 				const grb::Vector< StateType > old_state = state;
 #endif
@@ -655,8 +657,7 @@ namespace grb {
 				for(const auto &mask : masks ){
 					// dn = (2*state_slice - 1) * h_slice
 					rc = rc ? rc : grb::set< descr >( dn, mask, state );
-					rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( 2 ), ring.getMultiplicativeMonoid()  );
-					rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
+					rc = rc ? rc : grb::foldl< descr | grb::descriptors::invert_mask >( dn, state, static_cast< QType >( -1 ), right_assign_op );
 					rc = rc ? rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() );
 
 					// Choose which changes to accept
@@ -665,13 +666,11 @@ namespace grb {
 					rc = rc ? rc : grb::set< descr >( accept, dn, mask );
 
 					// new_state = np.where(accept, 1 - old, old)
-					rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( -1 ), ring.getMultiplicativeMonoid() );
-					rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( 1 ), ring.getAdditiveMonoid() );
+					rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( 1 ), neq_operator );
 					
 					// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
 					rc = rc ? rc : grb::set< descr >( delta, accept, state );
-					rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< StateType >( 2 ), ring.getMultiplicativeMonoid() );
-					rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< StateType >( -1 ), ring.getAdditiveMonoid() );
+					rc = rc ? rc : grb::foldl< descr | grb::descriptors::invert_mask >( delta, delta, static_cast< QType >( -1 ), right_assign_op );
 					
 					// Update delta_energy -= dot(dn, accept)
 					rc = rc ? rc : grb::dot< descr >( delta_energy, delta, h, ring );
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index fd64c2769..752840700 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -361,19 +361,15 @@ EnergyType sequential_sweep_immediate(
 		}
 
 		const grb::operators::leq< EnergyType > leq_operator;
+		const grb::operators::right_assign< EnergyType > right_assign_op;
+		const grb::operators::not_equal< EnergyType > neq_operator;
 #ifndef NDEBUG
 		const grb::Vector< IOType, backend > old_state = state;
 #endif
 		for(const auto &mask : masks ){
-
-			rc = rc ? rc : grb::clear( accept  );
-			rc = rc ? rc : grb::clear( delta  );
-			rc = rc ? rc : grb::clear( dn );
-
 			// dn = (2*state_slice - 1) * h_slice
 			rc = rc ? rc : grb::set< descr >( dn, mask, state );
-			rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( 2 ), ring.getMultiplicativeMonoid()  );
-			rc = rc ? rc : grb::foldl< descr >( dn, static_cast< EnergyType >( -1 ), ring.getAdditiveMonoid() );
+			rc = rc ? rc : grb::foldl< descr | grb::descriptors::invert_mask >( dn, state, static_cast< EnergyType >( -1 ), right_assign_op );
 			rc = rc ? rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() );
 
 			// Choose which changes to accept
@@ -382,14 +378,11 @@ EnergyType sequential_sweep_immediate(
 			rc = rc ? rc : grb::set< descr >( accept, dn, mask );
 
 			// new_state = np.where(accept, 1 - old, old)
-			rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< IOType >( -1 ), ring.getMultiplicativeMonoid() );
-			rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< IOType >( 1 ), ring.getAdditiveMonoid() );
+			rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< IOType >( 1 ), neq_operator );
 			
 			// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
-			rc = rc ? rc : grb::clear( delta  );
 			rc = rc ? rc : grb::set< descr >( delta, accept, state );
-			rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< IOType >( 2 ), ring.getMultiplicativeMonoid() );
-			rc = rc ? rc : grb::foldl< descr >( delta, accept, static_cast< IOType >( -1 ), ring.getAdditiveMonoid() );
+			rc = rc ? rc : grb::foldl< descr | grb::descriptors::invert_mask >( delta, delta, static_cast< EnergyType >( -1 ), right_assign_op );
 			
 			// Update delta_energy -= dot(dn, accept)
 			rc = rc ? rc : grb::dot< descr >( delta_energy, delta, h, ring );

From f963b2fecb6fcaea7a72e2c64f21ac2864ee8035 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 28 Nov 2025 10:15:05 +0100
Subject: [PATCH 34/58] Small improvements and fixes. Temperature is no longer
 constant + many type fixes

---
 .../algorithms/simulated_annealing_re.hpp     | 58 ++++++++++---------
 tests/smoke/CMakeLists.txt                    |  2 +-
 .../smoke/simulated_annealing_re_from_mpi.cpp | 36 ++++++------
 tests/smoke/simulated_annealing_re_ising.cpp  |  2 +-
 4 files changed, 52 insertions(+), 46 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 448258a63..dcc27606b 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -74,9 +74,8 @@ namespace grb {
 		/*
 		 * Do a Parallel Tempering pass.
 		 * This means exchanging states at low temperature with states at higher temperature.
-		 * To make the code simpler, this will be done by exchanging the temperatures instead.
 		 *
-		 * TODO: Fix this documentation.
+		 * TODO: Complete this documentation.
 		 *
 		 * @param[in,out] states        On input: initial states.
 		 * @param[in,out] energies      The initial energy of each state.
@@ -101,18 +100,22 @@ namespace grb {
 	pt(
 				std::vector< grb::Vector< StateType, backend > > &states,
 				grb::Vector< EnergyType, backend > &energies,
-				const grb::Vector< TempType, backend > &betas
+				const grb::Vector< TempType, backend > &betas,
+				const int seed = 42
 				){
 
 			const size_t n_replicas = states.size();
 			// const size_t s 		= spmd<>::pid();
 			// const size_t nprocs = spmd<>::nprocs();
+			std::srand( seed );
 			grb::RC rc = grb::SUCCESS;
+			std::minstd_rand rng ( seed );
+			std::exponential_distribution< EnergyType > rand ( 1.0 );
 
 			for( size_t i = n_replicas - 1 ; i > 0 ; --i ){
 				const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
 
-				if( de >= 0 || std::rand() < RAND_MAX * internal::exp( de ) ){
+				if( -rand( rng ) < de ){
 					std::swap( states[i], states[i-1] );
 					std::swap( energies[i], energies[i-1] );
 				}
@@ -136,7 +139,8 @@ namespace grb {
 		pt(
 				std::vector< grb::Vector< StateType, backend > > &states,
 				grb::Vector< EnergyType, backend > &energies,
-				const grb::Vector< TempType, backend > &betas
+				const grb::Vector< TempType, backend > &betas,
+				const int seed = 42
 				){
 			static_assert( backend != grb::BSP1D );
 			// static_assert( grb::_GRB_BACKEND == grb::BSP1D );
@@ -151,29 +155,30 @@ namespace grb {
 			assert( grb::size(energies) == n_replicas );
 			assert( grb::size(betas) == n_replicas );
 #endif
+			std::minstd_rand rng ( seed + s );
+			std::exponential_distribution< EnergyType > rand ( 1.0 );
 			struct data {
 					EnergyType e;
 					TempType b;
-					int r;
+					EnergyType r;
 				};
 			grb::Vector< StateType, backend > s0 ( n );
 			grb::Vector< StateType, backend > s1 ( n );
 			grb::set( s0, static_cast< StateType >( 0 ) );
 			grb::set( s1, static_cast< StateType >( 0 ) );
 
-
 			struct data msg[ 2 ];
 			rc = rc ? rc : grb::resize( s0, n );
 			rc = rc ? rc : grb::resize( s1, n );
 			if( rc != grb::SUCCESS ) return rc;
-			int rand = std::rand();
+			const auto myrand = -rand( rng );
 
 			for( size_t si = nprocs ; rc == grb::SUCCESS && si > 0; --si ){
 				if( si-1 == s ){
 					for( size_t i = n_replicas - 1 ; i > 0 ; --i ){
 						const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
 
-						if( de >= 0 || std::rand() < RAND_MAX * internal::exp( de ) ){
+						if( -rand( rng ) < de ){
 							std::swap( states[i], states[i-1] );
 							std::swap( energies[i], energies[i-1] );
 						}
@@ -186,7 +191,7 @@ namespace grb {
 					grb::set( s0, states[ n_replicas - 1 ] );
 					msg[ 0 ].e = energies[ n_replicas - 1 ];
 					msg[ 0 ].b = betas[ n_replicas - 1 ];
-					msg[ 0 ].r = rand;
+					msg[ 0 ].r = myrand;
 				}
 				if( si == 1 ) continue;
 
@@ -216,9 +221,8 @@ namespace grb {
 
 				const EnergyType de = ( msg[ 1 ].e - msg[ 0 ].e ) * ( msg[ 1 ].b - msg[ 0 ].b );
 
-				if( rc == grb::SUCCESS && ( de >= 0 || msg[ 0 ].r < RAND_MAX * internal::exp( de ) ) ){
+				if( rc == grb::SUCCESS && ( msg[ 0 ].r < de ) ){
 					if( si == s+1 ){
-
 						rc = rc ? rc : grb::set( states[ n_replicas - 1 ], s0 );
 						rc = rc ? rc : grb::setElement(energies, msg[ 0 ].e, n_replicas - 1 );
 					}else if( si ==  s+2 ){
@@ -294,6 +298,7 @@ namespace grb {
 				){
 
 			const size_t s = spmd<>::pid();
+			const size_t n_procs = spmd<>::nprocs();
 			const size_t n_replicas = states.size();
 			const size_t n = grb::size(states[0]);
 			(void) n;
@@ -337,7 +342,7 @@ namespace grb {
 				} // n_replicas
 				if( rc == SUCCESS && use_pt ){
 					// do a Parallel Tempering move
-					rc = pt< backend >( states, energies, betas );
+					rc = pt< backend >( states, energies, betas, i_sweep*n_procs + s );
 				}
 #ifndef NDEBUG
 				if( s == 0 ) {
@@ -355,6 +360,7 @@ namespace grb {
 			if( rc == SUCCESS ){
 				rc = rc ? rc : grb::collectives<>::allreduce(
 						best_energy, grb::operators::min< EnergyType >() );
+				// TODO: update best state to match best energy
 			}
 			
 			return rc;
@@ -481,8 +487,6 @@ namespace grb {
 		 *
 		 * states should be a vector of already initialized and filled dense grb::Vector.
 		 *
-		 *  TODO: expand and complete documentation
-		 *
 		 * Warning: This function allocates $O(n)$ memory for temporary vectors.
 		 *
 		 * @param[in,out] states        On input: initial (dense) states.
@@ -536,7 +540,6 @@ namespace grb {
 			(void) s;
 			grb::RC rc = grb::SUCCESS;
 
-			assert( grb::size(states[0]) == n );
 			assert( grb::nnz(states[0]) == n ); // state is dense
 			assert( states.size() == n_replicas );
 
@@ -576,21 +579,20 @@ namespace grb {
 
 			grb::Vector< QType, backend > h ( n );
 			grb::Vector< QType, backend > rand ( n );
-			grb::Vector< StateType, backend > delta ( n );
-			grb::Vector< EnergyType, backend > dn ( n );
+			grb::Vector< QType, backend > delta ( n );
+			grb::Vector< QType, backend > dn ( n );
 			grb::Vector< bool, backend > accept ( n );
-			std::srand( static_cast<unsigned>( seed ) );
     		std::minstd_rand rng ( seed ); // minstd_rand or std::mt19937
 
-			grb::resize( h, n );
-			grb::resize( rand, n );
-			grb::resize( delta, n );
-			grb::resize( dn, n );
-			grb::resize( accept, n );
+			rc = rc ? rc : grb::resize( h, n );
+			rc = rc ? rc : grb::resize( rand, n );
+			rc = rc ? rc : grb::resize( delta, n );
+			rc = rc ? rc : grb::resize( dn, n );
+			rc = rc ? rc : grb::resize( accept, n );
 
 			std::vector< grb::Vector< bool, backend > > masks ;
 			rc = rc ? rc : matrix_partition< descr >( masks, couplings, h, rand, seed );
-			grb::clear(h);
+			rc = rc ? rc : grb::clear(h);
 			constexpr auto dense_descr = descr | grb::descriptors::dense;
 
 			auto sweep_data = std::tie(
@@ -632,7 +634,9 @@ namespace grb {
 				const size_t n = grb::size( state );
 				EnergyType delta_energy = static_cast< EnergyType >(0.0);
 				grb::RC rc = grb::SUCCESS;
-				(void) n;
+
+				assert( grb::nnz(state) == n ); // state has to be dense!
+				assert( grb::nnz(local_fields) == n );
 
 				if( !empty_local_fields) {
 					rc = rc ? rc : grb::set< descr >( h, local_fields );
@@ -644,7 +648,7 @@ namespace grb {
 				std::exponential_distribution< EnergyType > rand_gen ( beta );
 				for( size_t i = 0 ; i < n; ++i ){
 					const auto rnd = -rand_gen( rng );
-					grb::setElement( rand, rnd, i );
+					rc = rc ? rc : grb::setElement( rand, rnd, i );
 				}
 
 				const grb::operators::leq< EnergyType > leq_operator;
diff --git a/tests/smoke/CMakeLists.txt b/tests/smoke/CMakeLists.txt
index 284799402..03c55d7fa 100644
--- a/tests/smoke/CMakeLists.txt
+++ b/tests/smoke/CMakeLists.txt
@@ -150,7 +150,7 @@ add_grb_executables( simulated_annealing_re_from_mpi simulated_annealing_re_from
 )
 
 add_grb_executables( simulated_annealing_re_ising simulated_annealing_re_ising.cpp
-	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
+	BACKENDS reference reference_omp hyperdags nonblocking
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
 )
 
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 752840700..6dc8f4273 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -287,8 +287,8 @@ EnergyType get_energy(
 	assert( n == grb::size( state ) );
 	assert( n == grb::ncols( couplings ) );
 	assert( n == grb::nrows( couplings ) );
-	grb::resize( tmp, n );
 	grb::RC rc = grb::SUCCESS;
+	rc = rc ? rc : grb::resize( tmp, n );
 	EnergyType energy = 0.0;
 	constexpr auto dense_descr = descr | grb::descriptors::dense;
 
@@ -318,21 +318,18 @@ EnergyType sequential_sweep_immediate(
 				 	 const grb::Vector< JType, backend >&,
 					 grb::Vector< JType, backend >&,
 					 grb::Vector< JType, backend >&,
-					 grb::Vector< IOType, backend >&,
+					 grb::Vector< JType, backend >&,
 					 const std::vector< grb::Vector< bool, backend > >&,
-					 grb::Vector< EnergyType, backend >&,
+					 grb::Vector< JType, backend >&,
 					 grb::Vector< bool, backend >&,
 					 std::minstd_rand&
 					 > &data
 			  ){
 		const size_t s = spmd<>::pid();
 		const Ring ring = Ring();
+		constexpr auto dense_descr = descr | grb::descriptors::dense;
 		(void) s;
 
-		grb::RC rc = grb::SUCCESS;
-		const size_t n = grb::size( state );
-		assert( grb::nnz(state) == n ); // state has to be dense!
-
 		EnergyType delta_energy = static_cast< EnergyType >(0.0);
 		const auto &couplings 	= std::get<0>(data);
 		const auto &local_fields = std::get<1>(data);
@@ -344,6 +341,11 @@ EnergyType sequential_sweep_immediate(
 		auto &accept	= std::get<7>(data);
 		auto &rng       = std::get<8>(data);
 
+		grb::RC rc = grb::SUCCESS;
+		const size_t n = grb::size( state );
+		assert( grb::nnz(state) == n ); // state has to be dense!
+		assert( grb::nnz(local_fields) == n );
+
 		rc = rc ? rc : grb::wait();
 		rc = rc ? rc : grb::resize( h, n );
 		rc = rc ? rc : grb::resize( rand, n );
@@ -351,13 +353,13 @@ EnergyType sequential_sweep_immediate(
 		rc = rc ? rc : grb::resize( dn, n );
 		rc = rc ? rc : grb::resize( accept, n );
 
-		rc = rc ? rc : grb::set< descr >( h, local_fields );
-		rc = rc ? rc : grb::mxv< descr >( h, couplings, state , ring );
+		rc = rc ? rc : grb::set< dense_descr >( h, local_fields );
+		rc = rc ? rc : grb::mxv< dense_descr >( h, couplings, state , ring );
 
 		std::exponential_distribution< EnergyType > rand_gen ( beta );
 		for( size_t i = 0 ; i < n; ++i ){
-			const auto rnd = -rand_gen( rng );
-			grb::setElement( rand, rnd, i );
+			const auto rnd = rand_gen( rng );
+			rc = rc ? rc : grb::setElement( rand, rnd, i );
 		}
 
 		const grb::operators::leq< EnergyType > leq_operator;
@@ -423,9 +425,9 @@ template<
 				 	 const grb::Vector< JType, backend >&,
 					 grb::Vector< JType, backend >&,
 					 grb::Vector< JType, backend >&,
-					 grb::Vector< IOType, backend >&,
+					 grb::Vector< JType, backend >&,
 					 const std::vector< grb::Vector< bool, backend > >&,
-					 grb::Vector< EnergyType, backend >&,
+					 grb::Vector< JType, backend >&,
 					 grb::Vector< bool, backend >&,
 					 std::minstd_rand&
 					 >,
@@ -519,7 +521,7 @@ void grbProgram(
 
 	// get user process ID
 	const size_t s = spmd<>::pid();
-	assert( s < spmd<>::nprocs() );
+	const size_t nprocs = spmd<>::nprocs();
 
 
     grb::utils::Timer timer;
@@ -623,7 +625,7 @@ void grbProgram(
     grb::Vector< EnergyType, internal_backend > energies( n_replicas );
     grb::Vector< EnergyType, internal_backend > tmp_energy( n );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
-        rc = rc ? rc : grb::setElement( betas, static_cast< JType >(10.0), r );
+        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( 10.0 * ( n_replicas * nprocs ) / ( n_replicas * s + r + 1) ), r );
         rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r], tmp_energy ), r );
     }
 
@@ -644,9 +646,9 @@ void grbProgram(
 	grb::Vector< JType, internal_backend > temp_h ( n );
 	grb::Vector< JType, internal_backend > temp_log_rand ( n );
 	grb::Vector< IOType, internal_backend > best_state ( n );
-	grb::Vector< EnergyType, internal_backend > temp_dn ( n );
+	grb::Vector< JType, internal_backend > temp_dn ( n );
 	grb::Vector< bool, internal_backend > temp_accept ( n );
-	grb::Vector< IOType, internal_backend > temp_delta ( n );
+	grb::Vector< JType, internal_backend > temp_delta ( n );
 
 	// build masks, we'll use two of the above temporary vectors
     std::vector< grb::Vector< bool, internal_backend > > masks;
diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index ec65d2c20..2f3541a9d 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -474,7 +474,7 @@ void grbProgram(
     grb::Vector< JType > betas( n_replicas );
     grb::Vector< EnergyType > energies( n_replicas );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
-        rc = rc ? rc : grb::setElement( betas, static_cast< JType >(10.0), r );
+        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( 10.0* n_replicas / (r+1) ), r );
         // rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r], tmp_energy ), r );
     }
 	assert( rc == grb::SUCCESS );

From 204248e8533562a8d4c0e2b47538b834f251ac8b Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Wed, 7 Jan 2026 13:25:52 +0100
Subject: [PATCH 35/58] Many small improvements and fixes. Temperature is no
 longer constant + many type fixes

---
 tests/smoke/simulated_annealing_re_from_mpi.cpp | 2 +-
 tests/smoke/simulated_annealing_re_ising.cpp    | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 6dc8f4273..a259fc8c3 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -625,7 +625,7 @@ void grbProgram(
     grb::Vector< EnergyType, internal_backend > energies( n_replicas );
     grb::Vector< EnergyType, internal_backend > tmp_energy( n );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
-        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( 10.0 * ( n_replicas * nprocs ) / ( n_replicas * s + r + 1) ), r );
+        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( 10.0 / ( n_replicas * s + r + 1) ), r );
         rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r], tmp_energy ), r );
     }
 
diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index 2f3541a9d..58f1c32b1 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -469,12 +469,11 @@ void grbProgram(
     #endif
 	}
 
-
     // also make betas vector os size n_replicas and initialize with 10.0
     grb::Vector< JType > betas( n_replicas );
     grb::Vector< EnergyType > energies( n_replicas );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
-        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( 10.0* n_replicas / (r+1) ), r );
+        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( n_replicas ) / static_cast< JType >( r + 1 ), r ); // cast before dividing: size_t division would truncate the beta ladder
         // rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r], tmp_energy ), r );
     }
 	assert( rc == grb::SUCCESS );

From 89ca8f0ef502c3a4e03d7f30a9e563b833b28a6b Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 28 Nov 2025 10:49:08 +0100
Subject: [PATCH 36/58] fixup! Small improvements and fixes. Temperature is no
 longer constant + many type fixes

---
 .../algorithms/simulated_annealing_re.hpp     |  1 +
 .../smoke/simulated_annealing_re_from_mpi.cpp | 29 ++++++++++-------
 tests/smoke/simulated_annealing_re_ising.cpp  | 31 +++++++++++--------
 3 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index dcc27606b..96c034e52 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -150,6 +150,7 @@ namespace grb {
 			const size_t s 		= spmd<>::pid();
 			const size_t nprocs = spmd<>::nprocs();
 			grb::RC rc = grb::SUCCESS;
+			std::srand( seed + s );
 
 #ifndef NDEBUG
 			assert( grb::size(energies) == n_replicas );
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index a259fc8c3..c862d68f8 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -38,15 +38,18 @@ const int LPF_MPI_AUTO_INITIALIZE = 0;
 
 using namespace grb;
 
-#define DEBUG_IMSB 1
-#define ISCLOSE(a,b) (std::abs((b)-(a))/std::abs(a) < 1e-4) || (std::abs((b)-(a)) < 1e-4)
-
+// #define DEBUG_IMSB 1
 constexpr size_t MAX_FN_SIZE = 255;
 
 // Types
-using IOType = double;   // scalar/vector element type
-using JType  = double;   // coupling (matrix) value type
-using EnergyType  = double;   // coupling (matrix) value type
+using IOType = int8_t;   // scalar/vector element type
+using JType  = float;   // coupling (matrix) value type
+using EnergyType  = double;   // energy value type
+
+template< typename T1, typename T2 >
+inline bool ISCLOSE( const T1 &a, const T2 &b, const double tol = 1e-4){
+	return (std::abs((b)-(a))/std::abs(a) < tol) || (std::abs((b)-(a)) < tol);
+}
 
 // Backend to be used inside each process
 constexpr grb::Backend internal_backend = grb::reference;
@@ -274,13 +277,14 @@ template<
 	class Ring = Semiring<
 		grb::operators::add< JType >, grb::operators::mul< JType >,
 		grb::identities::zero, grb::identities::one
-	>
+	>,
+	typename Ttmp
 	>
 EnergyType get_energy(
 				 const grb::Matrix< JType, backend >& couplings,
 				 const grb::Vector< JType, backend > &local_fields,
 				 const grb::Vector< IOType,backend > &state,
-				 grb::Vector< JType, backend > &tmp,
+				 grb::Vector< Ttmp, backend > &tmp,
 				 const Ring &ring = Ring()
 			  ){
 	const size_t n = grb::size( local_fields );
@@ -358,7 +362,7 @@ EnergyType sequential_sweep_immediate(
 
 		std::exponential_distribution< EnergyType > rand_gen ( beta );
 		for( size_t i = 0 ; i < n; ++i ){
-			const auto rnd = rand_gen( rng );
+			const auto rnd = -rand_gen( rng );
 			rc = rc ? rc : grb::setElement( rand, rnd, i );
 		}
 
@@ -371,7 +375,7 @@ EnergyType sequential_sweep_immediate(
 		for(const auto &mask : masks ){
 			// dn = (2*state_slice - 1) * h_slice
 			rc = rc ? rc : grb::set< descr >( dn, mask, state );
-			rc = rc ? rc : grb::foldl< descr | grb::descriptors::invert_mask >( dn, state, static_cast< EnergyType >( -1 ), right_assign_op );
+			rc = rc ? rc : grb::foldl< descr | grb::descriptors::invert_mask >( dn, state, static_cast< JType >( -1 ), right_assign_op );
 			rc = rc ? rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() );
 
 			// Choose which changes to accept
@@ -403,7 +407,7 @@ EnergyType sequential_sweep_immediate(
 		assert( rc == grb::SUCCESS );
 		const auto new_state = state;
 
-		const auto real_delta = get_energy(couplings, local_fields, new_state, h) - get_energy(couplings, local_fields, old_state, h);
+		const auto real_delta = get_energy( couplings, local_fields, new_state, dn ) - get_energy( couplings, local_fields, old_state, dn );
 		if(s == 0){
 			std::cerr << "\n\t Delta_energy: " << delta_energy;
 			std::cerr << "\n\t Real delta: " << real_delta;
@@ -591,7 +595,6 @@ void grbProgram(
     }
 
     // seed RNGs (C and C++ engines) using requested seed (hardcoded default 8 if not provided)
-    std::srand( static_cast<unsigned>( data_in.seed + s ) );
     std::minstd_rand rng ( data_in.seed + s ); // rng or std::mt19937
 
     // create states storage and initialize with random 1/0 values
@@ -654,9 +657,11 @@ void grbProgram(
     std::vector< grb::Vector< bool, internal_backend > > masks;
 	rc = rc ? rc : grb::algorithms::matrix_partition( masks, J, temp_h, temp_log_rand, test_data::seed );
 
+#ifdef DEBUG_IMSB
 	if( s == 0 ){
 		print_vector( masks.back(), 30, "MASK" );
 	}
+#endif
 	auto sweep_data = std::tie(
 			(const typeof(J)&) J,
 			(const typeof(h)&) h,
diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index 58f1c32b1..d0cb446c1 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -35,15 +35,18 @@
 
 using namespace grb;
 
-#define DEBUG_IMSB 1
-#define ISCLOSE(a,b) (std::abs((b)-(a))/std::abs(a) < 1e-4) || (std::abs((b)-(a)) < 1e-4)
-
+// #define DEBUG_IMSB 1
 constexpr size_t MAX_FN_SIZE = 255;
 
 // Types
-using IOType = double;   // scalar/vector element type
-using JType  = double;   // coupling (matrix) value type
-using EnergyType  = double;   // coupling (matrix) value type
+using IOType = int8_t;   // scalar/vector element type
+using JType  = float;   // coupling (matrix) value type
+using EnergyType  = double;   // energy value type
+
+template< typename T1, typename T2 >
+inline bool ISCLOSE( const T1 &a, const T2 &b, const double tol = 1e-4){
+	return (std::abs((b)-(a))/std::abs(a) < tol) || (std::abs((b)-(a)) < tol);
+}
 
 /** Parser type */
 typedef grb::utils::MatrixFileReader<
@@ -266,13 +269,14 @@ template<
 	class Ring = Semiring<
 		grb::operators::add< JType >, grb::operators::mul< JType >,
 		grb::identities::zero, grb::identities::one
-	>
+	>,
+	typename Ttmp
 	>
 EnergyType get_energy(
 				 const grb::Matrix< JType, backend >& couplings,
 				 const grb::Vector< JType, backend > &local_fields,
 				 const grb::Vector< IOType, backend > &state,
-				 grb::Vector< JType, backend > &tmp,
+				 grb::Vector< Ttmp, backend > &tmp,
 				 const Ring &ring = Ring()
 			  ){
 	const size_t n = grb::size( local_fields );
@@ -282,12 +286,13 @@ EnergyType get_energy(
 	grb::resize( tmp, n );
 	grb::RC rc = grb::SUCCESS;
 	EnergyType energy = 0.0;
+	constexpr auto dense_descr = descr | grb::descriptors::dense;
 
-	rc = rc ? rc : grb::set( tmp, 0.0 );
-	rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( tmp, couplings, state, ring );
-	rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() );
-	rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( tmp, local_fields, ring.getAdditiveMonoid() );
-	rc = rc ? rc : grb::dot< descr | grb::descriptors::dense >( energy, tmp, state, ring );
+	rc = rc ? rc : grb::set< descr >( tmp, 0.0 );
+	rc = rc ? rc : grb::mxv< dense_descr >( tmp, couplings, state, ring );
+	rc = rc ? rc : grb::foldl< dense_descr >( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() );
+	rc = rc ? rc : grb::foldl< dense_descr >( tmp, local_fields, ring.getAdditiveMonoid() );
+	rc = rc ? rc : grb::dot< dense_descr >( energy, tmp, state, ring );
 	assert( rc == grb::SUCCESS );
 
 	return energy;

From cccdb5977e6bd64d350d16d225606a989c389f5f Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 28 Nov 2025 10:49:30 +0100
Subject: [PATCH 37/58] Testing setup

---
 .../smoke/simulated_annealing_re_from_mpi.cpp | 45 ++++++++-------
 tests/smoke/simulated_annealing_re_ising.cpp  | 56 ++++++++++++-------
 2 files changed, 61 insertions(+), 40 deletions(-)

diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index c862d68f8..c6c733b69 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -716,9 +716,18 @@ void grbProgram(
 			}
 		}
 	} else {
+		rc = rc ? rc : grb::algorithms::simulated_annealing_RE( // untimed run before the benchmark loop (presumably a warm-up); keeps any earlier error
+			sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
+		);
+		rc = rc ? rc : grb::algorithms::simulated_annealing_RE( // second untimed run; skipped if the first one failed
+			sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
+		);
 		// do benchmark
-		timer.reset();
+		double min_time = 1e9;
+		double max_time = 0;
+		double total_time = 0;
 		for( size_t i = 0; i < out.rep && rc == SUCCESS; ++i ) {
+			timer.reset();
 			if( rc == SUCCESS ) {
 				out.iterations = data_in.nsweeps;
 
@@ -730,30 +739,22 @@ void grbProgram(
 			if( grb::Properties<>::isNonblockingExecution ) {
 				rc = rc ? rc : wait();
 			}
+			const double time_taken = timer.time();
+			min_time = std::min(min_time, time_taken);
+			max_time = std::max(max_time, time_taken);
+			total_time +=  time_taken;
 		}
-		const double time_taken = timer.time();
-		if( s == 0 ) {
-			for ( size_t r = 0; r < n_replicas; ++r ) {
-				std::cout << "Final state replica " << r << ":\n";
-				print_vector( states[r], 50 ,"states values" );  
-				std::cout << "With energy " << energies[ r ] << "\n";
-				std::cout << "With energy " << get_energy(  J, h, states[r], tmp_energy ) << "\n";
-				std::cout << std::endl;
-				assert( ISCLOSE( get_energy( J, h, states[r], tmp_energy ), energies[ r ] ) );
-			}
-		}
-
 
-		out.times.useful = time_taken / static_cast< double >( out.rep );
+		out.times.useful = total_time / static_cast< double >( out.rep );
 		// print timing at root process
 		if( s == 0 ) {
-			std::cout << "Time taken for " << out.rep << " "
+			std::cout << "Average Time taken for " << out.rep << " "
 				<< "Simulated Annealing RE calls (hot start): " << out.times.useful << ". "
 				<< "Error code is " << grb::toString( rc ) << std::endl;
-			std::cout << "\tnumber of IM-SB iterations: " << out.iterations << "\n";
-			std::cout << "\tmilliseconds per iteration: "
-				<< ( out.times.useful / static_cast< double >( out.iterations ) )
-				<< "\n";
+			std::cout << "\tnumber of IM-SB iterations: " << out.rep << "\n"; std::cout << "\tmilliseconds per iteration: "
+				<< ( out.times.useful / static_cast< double >( out.iterations ) ) << "\n";;
+			std::cout << "\tMin Time: " << min_time << "\n";
+			std::cout << "\tMax Time: " << max_time << "\n";
 		}
 		sleep( 1 );
 	}
@@ -786,6 +787,7 @@ void printhelp( char *progname ) {
               << "  --use-pt BOOL              Use Parallel Tampering (default: 1)\n"
               << "  --seed INT                 RNG seed (default: 8)\n"
               << "  --sweep STR                Sweep selector (default: sequential_sweep_immediate)\n"
+              << "  --rep INT                  number of times to repeat the run of the algorithm (default: 1)\n"
               << "  --verify                   Verify output against reference solution\n"
               << "  --ref-solution-fname STR   Reference solution file (required with --verify unless using default data)\n"
               << "  --help, -h                 Print this help message\n";
@@ -827,6 +829,9 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
         } else if ( a == "--sweep" ) {
             if ( i+1 >= argc ) { std::cerr << "--sweep requires an argument\n"; return false; }
 			std::strncpy( in.sweep_name, argv[++i], MAX_FN_SIZE );
+        } else if ( a == "--rep" ) {
+            if ( i+1 >= argc ) { std::cerr << "--rep requires an argument\n"; return false; }
+            in.rep = static_cast<unsigned>( std::stoul(argv[++i]) );
         } else if ( a == "--verify" ) {
             in.verify = true;
         } else if ( a == "--ref-solution-fname" ) {
@@ -907,7 +912,7 @@ int main( int argc, char ** argv ) {
 		return 51;
 	}
 	if( s == 0 ){
-		std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.iterations << " best_energy=" << out.best_energy << "\n";
+		std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.rep << " best_energy=" << out.best_energy << "\n";
 	}
 	
 	// finalise MPI
diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index d0cb446c1..d9ea571fa 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -439,8 +439,10 @@ void grbProgram(
     std::minstd_rand rng ( data_in.seed + s ); // rng or std::mt19937
 
     // create states storage and initialize with random 1/0 values
+    std::vector< grb::Vector<IOType> > states0;
     std::vector< grb::Vector<IOType> > states;
     for ( size_t r = 0; r < n_replicas; ++r ) {
+        states0.emplace_back( grb::Vector<IOType>(n) );
         states.emplace_back( grb::Vector<IOType>(n) );
         // initialize with random values
         std::uniform_int_distribution< unsigned short > randint(0,1);
@@ -451,11 +453,12 @@ void grbProgram(
                 randint( rng ) ) );
         }
         rc = rc ? rc : grb::buildVector(
-            states.back(),
+            states0.back(),
             rand_data.cbegin(),
             rand_data.cend(),
             SEQUENTIAL
         );
+		grb::set( states.back(), states0.back() );
     }
 
 	grb::Vector< EnergyType > tmp_energy ( n );
@@ -525,9 +528,25 @@ void grbProgram(
 			}
 		}
 	} else {
+		for( size_t i = 0; i < 2 ; ++i ){
+			for ( size_t r = 0; r < n_replicas; ++r ) {
+				grb::set(states[r], states0[r]);
+			}
+			grb::clear( energies );
+			rc = grb::algorithms::simulated_annealing_RE_Ising(
+			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
+			);
+		}
 		// do benchmark
-		timer.reset();
+		double min_time = 1e9;
+		double max_time = 0;
+		double total_time = 0;
 		for( size_t i = 0; i < out.rep && rc == SUCCESS; ++i ) {
+			for ( size_t r = 0; r < n_replicas; ++r ) {
+				grb::set(states[r], states0[r]);
+			}
+			grb::clear( energies );
+			timer.reset();
 			if( rc == SUCCESS ) {
 				out.iterations = data_in.nsweeps;
 
@@ -538,29 +557,22 @@ void grbProgram(
 			if( grb::Properties<>::isNonblockingExecution ) {
 				rc = rc ? rc : wait();
 			}
-		}
-		const double time_taken = timer.time();
-		for ( size_t r = 0; r < n_replicas; ++r ) {
-			const auto energy = energies[r];
-			if( s == 0 ) {
-				std::cout << "Final state replica " << r << ":\n";
-				print_vector( states[r], 50 ,"states values" );  
-				std::cout << "With energy " << energy << "\n";
-				std::cout << std::endl;
-			}
-			assert( ISCLOSE( get_energy( J, h, states[r], tmp_energy ), energies[ r ] ) );
+			const double time_taken = timer.time();
+			min_time = std::min(min_time, time_taken);
+			max_time = std::max(max_time, time_taken);
+			total_time +=  time_taken;
 		}
 
-		out.times.useful = time_taken / static_cast< double >( out.rep );
+		out.times.useful = total_time / static_cast< double >( out.rep );
 		// print timing at root process
 		if( s == 0 ) {
-			std::cout << "Time taken for " << out.rep << " "
+			std::cout << "Average Time taken for " << out.rep << " "
 				<< "Simulated Annealing RE calls (hot start): " << out.times.useful << ". "
 				<< "Error code is " << grb::toString( rc ) << std::endl;
-			std::cout << "\tnumber of IM-SB iterations: " << out.iterations << "\n";
-			std::cout << "\tmilliseconds per iteration: "
-				<< ( out.times.useful / static_cast< double >( out.iterations ) )
-				<< "\n";
+			std::cout << "\tnumber of IM-SB iterations: " << out.rep << "\n"; std::cout << "\tmilliseconds per iteration: "
+				<< ( out.times.useful / static_cast< double >( out.iterations ) ) << "\n";;
+			std::cout << "\tMin Time: " << min_time << "\n";
+			std::cout << "\tMax Time: " << max_time << "\n";
 
 			if( data_in.verify ){
 				if( out.best_energy < initial_energy ){
@@ -600,6 +612,7 @@ void printhelp( char *progname ) {
               << "  --nsweeps INT              Number of sweeps (default: 2)\n"
               << "  --use-pt BOOL              Use Parallel Tampering (default: 1)\n"
               << "  --seed INT                 RNG seed (default: 8)\n"
+              << "  --rep INT                  number of times to repeat the run of the algorithm (default: 1)\n"
               << "  --verify                   Verify output against reference solution\n"
               << "  --ref-solution-fname STR   Reference solution file (required with --verify unless using default data)\n"
               << "  --help, -h                 Print this help message\n";
@@ -635,6 +648,9 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
         } else if ( a == "--use-pt" ) {
             if ( i+1 >= argc ) { std::cerr << "--use-pt requires an argument\n"; return false; }
             in.use_pt = static_cast<bool>( std::stoul(argv[++i]) );
+        } else if ( a == "--rep" ) {
+            if ( i+1 >= argc ) { std::cerr << "--rep requires an argument\n"; return false; }
+            in.rep = static_cast<unsigned>( std::stoul(argv[++i]) );
         } else if ( a == "--seed" ) {
             if ( i+1 >= argc ) { std::cerr << "--seed requires an argument\n"; return false; }
             in.seed = static_cast<unsigned>( std::stoul(argv[++i]) );
@@ -706,6 +722,6 @@ int main( int argc, char ** argv ) {
         }
     }
 
-    std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.iterations << " best_energy=" << out.best_energy << "\n";
+    std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.rep << " best_energy=" << out.best_energy << "\n";
     return out.error_code;
 }

From 88d44137d92cf49f5e9128a7f6dcc76b5c5c3eb8 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Wed, 7 Jan 2026 13:26:33 +0100
Subject: [PATCH 38/58] fixup! Testing setup

---
 .../smoke/simulated_annealing_re_from_mpi.cpp | 35 +++++++++++++------
 tests/smoke/simulated_annealing_re_ising.cpp  | 10 +++---
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index c6c733b69..22c7c280e 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -599,9 +599,11 @@ void grbProgram(
 
     // create states storage and initialize with random 1/0 values
     const size_t n_replicas = data_in.n_replicas;
+    std::vector< grb::Vector< IOType, internal_backend > > states0;
     std::vector< grb::Vector< IOType, internal_backend > > states;
     for ( size_t r = 0; r < n_replicas; ++r ) {
         states.emplace_back( grb::Vector< IOType, internal_backend >(n) );
+        states0.emplace_back( grb::Vector< IOType, internal_backend >(n) );
         // initialize with random values
         std::uniform_int_distribution< unsigned short > randint(0,1);
         // we use buildvectorUnique with a random set of indices
@@ -611,11 +613,12 @@ void grbProgram(
                 randint( rng ) ) );
         }
         rc = rc ? rc : grb::buildVector(
-            states.back(),
+            states0.back(),
             rand_data.cbegin(),
             rand_data.cend(),
             SEQUENTIAL
         );
+		rc = rc ? rc : grb::set( states.back(), states0.back() );
     }
 	using Ring = Semiring<
 			grb::operators::add< JType >, grb::operators::mul< JType >,
@@ -626,11 +629,13 @@ void grbProgram(
     // also make betas vector os size n_replicas and initialize with 10.0
     grb::Vector< JType, internal_backend > betas( n_replicas );
     grb::Vector< EnergyType, internal_backend > energies( n_replicas );
+    grb::Vector< EnergyType, internal_backend > energies0( n_replicas );
     grb::Vector< EnergyType, internal_backend > tmp_energy( n );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
-        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( 10.0 / ( n_replicas * s + r + 1) ), r );
-        rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r], tmp_energy ), r );
+        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( 10.0 * ( n_replicas * nprocs ) / ( n_replicas * s + r + 1) ), r );
+        rc = rc ? rc : grb::setElement( energies0, get_energy(  J, h, states[r], tmp_energy ), r );
     }
+	rc = rc ? rc : grb::set( energies, energies0 );
 
     #ifdef DEBUG_IMSB
     if( s == 0 ) {
@@ -638,7 +643,7 @@ void grbProgram(
             std::cout << "Process " << s << ": ";
             std::cout << "Initial state replica " << r << ":\n";
             print_vector( states[r], 30 ,"states values" );  
-			std::cout << "With energy " << energies[r] << "\n";
+			std::cout << "With energy " << energies0[r] << "\n";
             std::cout << std::endl;
         }
     }
@@ -716,18 +721,26 @@ void grbProgram(
 			}
 		}
 	} else {
-		rc = grb::algorithms::simulated_annealing_RE(
-			sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
-		);
-		rc = grb::algorithms::simulated_annealing_RE(
-			sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
-		);
+		for( size_t i = 0; i < 2 ; ++i ){
+			for ( size_t r = 0; r < n_replicas; ++r ) {
+				rc = rc ? rc : grb::set(states[r], states0[r]);
+			}
+			rc = rc ? rc : grb::set( energies, energies0 );
+
+			rc = grb::algorithms::simulated_annealing_RE(
+				sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
+			);
+		}
 		// do benchmark
 		double min_time = 1e9;
 		double max_time = 0;
 		double total_time = 0;
 		for( size_t i = 0; i < out.rep && rc == SUCCESS; ++i ) {
-		timer.reset();
+			for ( size_t r = 0; r < n_replicas; ++r ) {
+				rc = rc ? rc : grb::set(states[r], states0[r]);
+			}
+			rc = rc ? rc : grb::set( energies, energies0 );
+			timer.reset();
 			if( rc == SUCCESS ) {
 				out.iterations = data_in.nsweeps;
 
diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index d9ea571fa..790f663ca 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -458,7 +458,7 @@ void grbProgram(
             rand_data.cend(),
             SEQUENTIAL
         );
-		grb::set( states.back(), states0.back() );
+		rc = rc ? rc : grb::set( states.back(), states0.back() );
     }
 
 	grb::Vector< EnergyType > tmp_energy ( n );
@@ -530,9 +530,9 @@ void grbProgram(
 	} else {
 		for( size_t i = 0; i < 2 ; ++i ){
 			for ( size_t r = 0; r < n_replicas; ++r ) {
-				grb::set(states[r], states0[r]);
+				rc = rc ? rc : grb::set(states[r], states0[r]);
 			}
-			grb::clear( energies );
+			rc = rc ? rc : grb::clear( energies );
 			rc = grb::algorithms::simulated_annealing_RE_Ising(
 			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
 			);
@@ -543,9 +543,9 @@ void grbProgram(
 		double total_time = 0;
 		for( size_t i = 0; i < out.rep && rc == SUCCESS; ++i ) {
 			for ( size_t r = 0; r < n_replicas; ++r ) {
-				grb::set(states[r], states0[r]);
+				rc = rc ? rc : grb::set(states[r], states0[r]);
 			}
-			grb::clear( energies );
+			rc = rc ? rc : grb::clear( energies );
 			timer.reset();
 			if( rc == SUCCESS ) {
 				out.iterations = data_in.nsweeps;

From f7b169cd38a376550f022b1379d342cd58146bb2 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 28 Nov 2025 11:47:44 +0100
Subject: [PATCH 39/58] Communicate states only if necessary - big time saving

---
 .../graphblas/algorithms/simulated_annealing_re.hpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 96c034e52..ef7df0af6 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -197,16 +197,11 @@ namespace grb {
 				if( si == 1 ) continue;
 
 #ifdef _GRB_WITH_LPF
-				rc = rc ? rc : grb::internal::broadcast( s0, si-2 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].e, si-2 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].b, si-2 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].r, si-2 );
-				rc = rc ? rc : grb::internal::broadcast( s1, si-1 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].e, si-1 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].b, si-1 );
-
-				assert( grb::nnz(s0) == n ); // state has to be dense!
-				assert( grb::nnz(s1) == n ); // state has to be dense!
 #else
 				assert( false ); // this should never run
 #endif
@@ -223,6 +218,14 @@ namespace grb {
 				const EnergyType de = ( msg[ 1 ].e - msg[ 0 ].e ) * ( msg[ 1 ].b - msg[ 0 ].b );
 
 				if( rc == grb::SUCCESS && ( msg[ 0 ].r < de ) ){
+#ifdef _GRB_WITH_LPF
+					rc = rc ? rc : grb::internal::broadcast( s0, si-2 );
+					rc = rc ? rc : grb::internal::broadcast( s1, si-1 );
+					assert( grb::nnz(s0) == n ); // state has to be dense!
+					assert( grb::nnz(s1) == n ); // state has to be dense!
+#else
+					assert( false ); // this should never run
+#endif
 					if( si == s+1 ){
 						rc = rc ? rc : grb::set( states[ n_replicas - 1 ], s0 );
 						rc = rc ? rc : grb::setElement(energies, msg[ 0 ].e, n_replicas - 1 );

From 3a4efdd3158dab3ecf4a070ad8d6ba4a7a1b63b0 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Wed, 7 Jan 2026 13:27:50 +0100
Subject: [PATCH 40/58] Fixed seeding

---
 include/graphblas/algorithms/simulated_annealing_re.hpp | 7 +++----
 tests/smoke/simulated_annealing_re_from_mpi.cpp         | 8 ++++----
 tests/smoke/simulated_annealing_re_ising.cpp            | 5 ++---
 3 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index ef7df0af6..733ce1be3 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -107,7 +107,6 @@ namespace grb {
 			const size_t n_replicas = states.size();
 			// const size_t s 		= spmd<>::pid();
 			// const size_t nprocs = spmd<>::nprocs();
-			std::srand( seed );
 			grb::RC rc = grb::SUCCESS;
 			std::minstd_rand rng ( seed );
 			std::exponential_distribution< EnergyType > rand ( 1.0 );
@@ -150,7 +149,6 @@ namespace grb {
 			const size_t s 		= spmd<>::pid();
 			const size_t nprocs = spmd<>::nprocs();
 			grb::RC rc = grb::SUCCESS;
-			std::srand( seed + s );
 
 #ifndef NDEBUG
 			assert( grb::size(energies) == n_replicas );
@@ -298,7 +296,8 @@ namespace grb {
 				grb::Vector< StateType, backend >  &best_state,
 				EnergyType &best_energy,
 				const size_t &n_sweeps,
-				const bool &use_pt = false
+				const bool &use_pt = false,
+				const size_t &seed = 42
 				){
 
 			const size_t s = spmd<>::pid();
@@ -346,7 +345,7 @@ namespace grb {
 				} // n_replicas
 				if( rc == SUCCESS && use_pt ){
 					// do a Parallel Tempering move
-					rc = pt< backend >( states, energies, betas, i_sweep*n_procs + s );
+					rc = pt< backend >( states, energies, betas, seed + i_sweep*n_procs + s );
 				}
 #ifndef NDEBUG
 				if( s == 0 ) {
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 22c7c280e..9d481e242 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -660,7 +660,7 @@ void grbProgram(
 
 	// build masks, we'll use two of the above temporary vectors
     std::vector< grb::Vector< bool, internal_backend > > masks;
-	rc = rc ? rc : grb::algorithms::matrix_partition( masks, J, temp_h, temp_log_rand, test_data::seed );
+	rc = rc ? rc : grb::algorithms::matrix_partition( masks, J, temp_h, temp_log_rand, data_in.seed );
 
 #ifdef DEBUG_IMSB
 	if( s == 0 ){
@@ -685,7 +685,7 @@ void grbProgram(
 	if( out.rep == 0 ) {
 		timer.reset();
 		rc = grb::algorithms::simulated_annealing_RE(
-				sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
+				sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
         );
 
 		rc = rc ? rc : wait();
@@ -728,7 +728,7 @@ void grbProgram(
 			rc = rc ? rc : grb::set( energies, energies0 );
 
 			rc = grb::algorithms::simulated_annealing_RE(
-				sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
+				sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
 			);
 		}
 		// do benchmark
@@ -745,7 +745,7 @@ void grbProgram(
 				out.iterations = data_in.nsweeps;
 
                 rc = grb::algorithms::simulated_annealing_RE(
-					sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
+					sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed + i
                 );
 				grb::collectives<>::allreduce( out.best_energy, grb::operators::min< EnergyType >() );
 			}
diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index 790f663ca..12d740b36 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -435,7 +435,6 @@ void grbProgram(
     }
 
     // seed RNGs (C and C++ engines) using requested seed (hardcoded default 8 if not provided)
-    std::srand( static_cast<unsigned>( data_in.seed + s ) );
     std::minstd_rand rng ( data_in.seed + s ); // rng or std::mt19937
 
     // create states storage and initialize with random 1/0 values
@@ -534,7 +533,7 @@ void grbProgram(
 			}
 			rc = rc ? rc : grb::clear( energies );
 			rc = grb::algorithms::simulated_annealing_RE_Ising(
-			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
+			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
 			);
 		}
 		// do benchmark
@@ -551,7 +550,7 @@ void grbProgram(
 				out.iterations = data_in.nsweeps;
 
                 rc = grb::algorithms::simulated_annealing_RE_Ising(
-				 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
+				 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed + i
                 );
 			}
 			if( grb::Properties<>::isNonblockingExecution ) {

From acab644a23481dced2adb78ecf1bfecd7be4b911 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Wed, 7 Jan 2026 13:28:32 +0100
Subject: [PATCH 41/58] Some changes and improvements: SPMD now uses
 simulated_annealing_RE_Ising, more checks

---
 tests/smoke/simulated_annealing_re_ising.cpp | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index 12d740b36..ffe3a1aec 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -35,7 +35,7 @@
 
 using namespace grb;
 
-// #define DEBUG_IMSB 1
+// #define DEBUG_SARE 1
 constexpr size_t MAX_FN_SIZE = 255;
 
 // Types
@@ -182,7 +182,7 @@ void read_matrix_data(const std::string &filename, std::vector<Dtype> &data, boo
 			++it
 		) {
 			data.push_back( Dtype( *it ) );
-#ifdef DEBUG_IMSB
+#ifdef DEBUG_SARE
 			if( spmd<>::pid() == 0 ){
 				// print last data element from std::vector<NonzeroT> data
 				std::cout << "readmatrix_data: " << data.back().first.first << ", "
@@ -207,7 +207,7 @@ void read_matrix_data_from_array(
             data.emplace_back(
                 NonzeroT( entry.first.first, entry.first.second, entry.second )
             );
-#ifdef DEBUG_IMSB
+#ifdef DEBUG_SARE
 			if( spmd<>::pid() < 1 ){
 				// print last data element from std::vector<NonzeroT> data
 				std::cout << "read_matrix_data_from_array: " << data.back().first.first << ", "
@@ -415,7 +415,7 @@ void grbProgram(
 		// grb::foldl< grb::descriptors::transpose_right >( J, Jt, addMonoid); // issue  #210
 		// grb::foldl<>( J, static_cast< JType >( 0.5 ), mulMonoid);
 
-#ifdef DEBUG_IMSB
+#ifdef DEBUG_SARE
 		if( s == 0 && grb::ncols( J ) < 40 ) {
 			std::cout << "Matrix J:\n";
 			print_matrix( J );
@@ -466,7 +466,7 @@ void grbProgram(
 	for ( size_t r = 0; r < n_replicas; ++r ) {
 		const auto en = get_energy(  J, h, states[r], tmp_energy );
 		initial_energy = std::min( en, initial_energy );
-    #ifdef DEBUG_IMSB
+    #ifdef DEBUG_SARE
 		if( s == 0 ) {
 			std::cout << "Initial state replica " << r << ":\n";
 			print_vector( states[r], 30 ,"states values" );
@@ -480,7 +480,7 @@ void grbProgram(
     grb::Vector< JType > betas( n_replicas );
     grb::Vector< EnergyType > energies( n_replicas );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
-        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( n_replicas / (r+1) ), r );
+        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( (10.0) * std::pow<JType>( 2, r ) ), r );
         // rc = rc ? rc : grb::setElement( energies, get_energy(  J, h, states[r], tmp_energy ), r );
     }
 	assert( rc == grb::SUCCESS );
@@ -492,7 +492,7 @@ void grbProgram(
 	if( out.rep == 0 ) {
 		timer.reset();
 		rc = grb::algorithms::simulated_annealing_RE_Ising(
-				 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt
+			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
         );
 
 		rc = rc ? rc : wait();
@@ -535,6 +535,7 @@ void grbProgram(
 			rc = grb::algorithms::simulated_annealing_RE_Ising(
 			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
 			);
+			assert( ISCLOSE( get_energy(  J, h, best_state, tmp_energy ), out.best_energy) );
 		}
 		// do benchmark
 		double min_time = 1e9;
@@ -557,6 +558,8 @@ void grbProgram(
 				rc = rc ? rc : wait();
 			}
 			const double time_taken = timer.time();
+
+			assert( ISCLOSE( get_energy(  J, h, best_state, tmp_energy ), out.best_energy) );
 			min_time = std::min(min_time, time_taken);
 			max_time = std::max(max_time, time_taken);
 			total_time +=  time_taken;

From c4e471fb263b5a22588314f395a4b38cd6779985 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Wed, 3 Dec 2025 12:02:12 +0100
Subject: [PATCH 42/58] Some changes and improvements: SPMD now uses
 simulated_annealing_RE_Ising, more checks

---
 .../algorithms/simulated_annealing_re.hpp     | 23 ++++----
 .../smoke/simulated_annealing_re_from_mpi.cpp | 59 +++++--------------
 2 files changed, 28 insertions(+), 54 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 733ce1be3..78f9ada92 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -406,6 +406,7 @@ namespace grb {
 			const size_t n = grb::nrows( A );
 			const size_t s = spmd<>::pid();
 			assert( n == grb::ncols( A ) ); // A needs to be square
+			// assert( grb::is_symmetric( A ) );
 			(void) s;
 
 			grb::resize( frontier, n );
@@ -545,27 +546,30 @@ namespace grb {
 
 			assert( grb::nnz(states[0]) == n ); // state is dense
 			assert( states.size() == n_replicas );
+			// assert( grb::is_symmetric( couplings ) );
 
+			assert( empty_local_fields || ( grb::size( local_fields ) == n ) );
+			assert( empty_local_fields || ( grb::nnz(local_fields) == n ) );
 			EnergyType energy;
 			grb::Vector< EnergyType, backend > tmp_calc_energy ( n );
 
-			const auto get_energy = [&couplings, &local_fields, &tmp_calc_energy, &ring](
-					EnergyType &energy, const grb::Vector< StateType > &state
+			const auto get_energy = [&couplings, &local_fields, &tmp_calc_energy, &ring, &n](
+					EnergyType &energy, const grb::Vector< StateType, backend > &state
 					){
-				const size_t n = grb::size( local_fields );
-				(void) n;
 				assert( n == grb::size( state ) );
 				assert( n == grb::ncols( couplings ) );
 				assert( n == grb::nrows( couplings ) );
 				grb::RC rc = grb::SUCCESS;
+				constexpr auto dense_descr = descr | grb::descriptors::dense;
+
 				grb::set( tmp_calc_energy, static_cast<EnergyType>( 0.0 ) );
-				rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( tmp_calc_energy, couplings, state, ring );
-				rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( tmp_calc_energy, static_cast< EnergyType >( 0.5 ),
+				rc = rc ? rc : grb::mxv< dense_descr >( tmp_calc_energy, couplings, state, ring );
+				rc = rc ? rc : grb::foldl< dense_descr >( tmp_calc_energy, static_cast< EnergyType >( 0.5 ),
 						ring.getMultiplicativeMonoid() );
 				if( !empty_local_fields) {
-					rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( tmp_calc_energy, local_fields, ring.getAdditiveMonoid() );
+					rc = rc ? rc : grb::foldl< dense_descr >( tmp_calc_energy, local_fields, ring.getAdditiveMonoid() );
 				}
-				rc = rc ? rc : grb::dot< descr | grb::descriptors::dense >( energy, tmp_calc_energy, state, ring );
+				rc = rc ? rc : grb::dot< dense_descr >( energy, tmp_calc_energy, state, ring );
 				return rc;
 			};
 
@@ -639,7 +643,6 @@ namespace grb {
 				grb::RC rc = grb::SUCCESS;
 
 				assert( grb::nnz(state) == n ); // state has to be dense!
-				assert( grb::nnz(local_fields) == n );
 
 				if( !empty_local_fields) {
 					rc = rc ? rc : grb::set< descr >( h, local_fields );
@@ -658,7 +661,7 @@ namespace grb {
 				const grb::operators::right_assign< EnergyType > right_assign_op;
 				const grb::operators::not_equal< EnergyType > neq_operator;
 #ifndef NDEBUG
-				const grb::Vector< StateType > old_state = state;
+				const grb::Vector< StateType, backend > old_state = state;
 #endif
 				rc = rc ? rc : grb::wait();
 				for(const auto &mask : masks ){
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 9d481e242..390635437 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -38,7 +38,7 @@ const int LPF_MPI_AUTO_INITIALIZE = 0;
 
 using namespace grb;
 
-// #define DEBUG_IMSB 1
+// #define DEBUG_SARE 1
 constexpr size_t MAX_FN_SIZE = 255;
 
 // Types
@@ -190,7 +190,7 @@ void read_matrix_data(const std::string &filename, std::vector<Dtype> &data, boo
 			++it
 		) {
 			data.push_back( Dtype( *it ) );
-#ifdef DEBUG_IMSB
+#ifdef DEBUG_SARE
 			if( spmd<>::pid() == 0 ){
 				// print last data element from std::vector<NonzeroT> data
 				std::cout << "readmatrix_data: " << data.back().first.first << ", "
@@ -215,7 +215,7 @@ void read_matrix_data_from_array(
             data.emplace_back(
                 NonzeroT( entry.first.first, entry.first.second, entry.second )
             );
-#ifdef DEBUG_IMSB
+#ifdef DEBUG_SARE
 			if( spmd<>::pid() < 2 ){
 				// print last data element from std::vector<NonzeroT> data
 				std::cout << "read_matrix_data_from_array: " << data.back().first.first << ", "
@@ -575,7 +575,7 @@ void grbProgram(
 			return;
 		}
 
-#ifdef DEBUG_IMSB
+#ifdef DEBUG_SARE
 		if( s == 0 && grb::ncols( J ) < 40 ) {
 			std::cout << "Matrix J:\n";
 			print_matrix( J );
@@ -624,7 +624,7 @@ void grbProgram(
 			grb::operators::add< JType >, grb::operators::mul< JType >,
 			grb::identities::zero, grb::identities::one >;
 	
-	const auto sweep = sequential_sweep_immediate< Ring >; // get_sweep_function( data_in.sweep_name );
+	// const auto sweep = sequential_sweep_immediate< Ring >; // get_sweep_function( data_in.sweep_name );
 
     // also make betas vector os size n_replicas and initialize with 10.0
     grb::Vector< JType, internal_backend > betas( n_replicas );
@@ -632,12 +632,12 @@ void grbProgram(
     grb::Vector< EnergyType, internal_backend > energies0( n_replicas );
     grb::Vector< EnergyType, internal_backend > tmp_energy( n );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
-        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( 10.0 * ( n_replicas * nprocs ) / ( n_replicas * s + r + 1) ), r );
+        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( (10.0 / (s * n_replicas) ) * std::pow<JType>( 1.5, ( n_replicas * s + r ) ) ), r );
         rc = rc ? rc : grb::setElement( energies0, get_energy(  J, h, states[r], tmp_energy ), r );
     }
 	rc = rc ? rc : grb::set( energies, energies0 );
 
-    #ifdef DEBUG_IMSB
+    #ifdef DEBUG_SARE
     if( s == 0 ) {
         for ( size_t r = 0; r < n_replicas; ++r ) {
             std::cout << "Process " << s << ": ";
@@ -648,44 +648,15 @@ void grbProgram(
         }
     }
     #endif
-    rc = rc ? rc : wait();
-
-	// we allocate temporary vectors
-	grb::Vector< JType, internal_backend > temp_h ( n );
-	grb::Vector< JType, internal_backend > temp_log_rand ( n );
 	grb::Vector< IOType, internal_backend > best_state ( n );
-	grb::Vector< JType, internal_backend > temp_dn ( n );
-	grb::Vector< bool, internal_backend > temp_accept ( n );
-	grb::Vector< JType, internal_backend > temp_delta ( n );
-
-	// build masks, we'll use two of the above temporary vectors
-    std::vector< grb::Vector< bool, internal_backend > > masks;
-	rc = rc ? rc : grb::algorithms::matrix_partition( masks, J, temp_h, temp_log_rand, data_in.seed );
-
-#ifdef DEBUG_IMSB
-	if( s == 0 ){
-		print_vector( masks.back(), 30, "MASK" );
-	}
-#endif
-	auto sweep_data = std::tie(
-			(const typeof(J)&) J,
-			(const typeof(h)&) h,
- 			temp_h,
-			temp_log_rand,
-			temp_delta,
-			(const typeof(masks)&) masks,
-			temp_dn,
-			temp_accept,
-			rng
-			);
-	grb::wait();
+    rc = rc ? rc : wait();
 
 	out.rep = data_in.rep;
 	// time a single call
 	if( out.rep == 0 ) {
 		timer.reset();
-		rc = grb::algorithms::simulated_annealing_RE(
-				sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
+		rc = grb::algorithms::simulated_annealing_RE_Ising(
+				 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
         );
 
 		rc = rc ? rc : wait();
@@ -727,8 +698,8 @@ void grbProgram(
 			}
 			rc = rc ? rc : grb::set( energies, energies0 );
 
-			rc = grb::algorithms::simulated_annealing_RE(
-				sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
+			rc = grb::algorithms::simulated_annealing_RE_Ising(
+				J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
 			);
 		}
 		// do benchmark
@@ -744,9 +715,9 @@ void grbProgram(
 			if( rc == SUCCESS ) {
 				out.iterations = data_in.nsweeps;
 
-                rc = grb::algorithms::simulated_annealing_RE(
-					sweep, sweep_data, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed + i
-                );
+				rc = grb::algorithms::simulated_annealing_RE_Ising(
+					 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
+				);
 				grb::collectives<>::allreduce( out.best_energy, grb::operators::min< EnergyType >() );
 			}
 			if( grb::Properties<>::isNonblockingExecution ) {

From 03a1949b02782b68b121e30a836938e6618d7a67 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Wed, 3 Dec 2025 12:02:59 +0100
Subject: [PATCH 43/58] Corrected SPMD PT swap

---
 include/graphblas/algorithms/simulated_annealing_re.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 78f9ada92..d03d6fd2b 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -225,10 +225,10 @@ namespace grb {
 					assert( false ); // this should never run
 #endif
 					if( si == s+1 ){
-						rc = rc ? rc : grb::set( states[ n_replicas - 1 ], s0 );
+						rc = rc ? rc : grb::set( states[ n_replicas - 1 ], s1 );
 						rc = rc ? rc : grb::setElement(energies, msg[ 0 ].e, n_replicas - 1 );
 					}else if( si ==  s+2 ){
-						rc = rc ? rc : grb::set( states[ 0 ], s1 );
+						rc = rc ? rc : grb::set( states[ 0 ], s0 );
 						rc = rc ? rc : grb::setElement(energies, msg[ 1 ].e, 0 );
 					}
 				}

From 74f81ad60c0fbe3c03dcf9a214cc938055225098 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Wed, 10 Dec 2025 10:47:24 +0100
Subject: [PATCH 44/58] Cleanup of SPMD

---
 .../smoke/simulated_annealing_re_from_mpi.cpp | 174 +-----------------
 1 file changed, 10 insertions(+), 164 deletions(-)

diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 390635437..aeff80c2c 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -15,6 +15,7 @@
 #include <sstream>
 #include <vector>
 #include <tuple>
+#include <string>
 #include <memory>
 #include <algorithm>
 #include <random>
@@ -80,7 +81,6 @@ typedef grb::utils::Singleton<
         size_t,                    // n_replicas
         bool,                      // use_pt
         unsigned,                  // seed
-        char[ MAX_FN_SIZE + 1 ],   // sweep_name
         std::vector<NonzeroT>,     // matrix data
         std::vector<JType>         // h vector
     >
@@ -306,154 +306,6 @@ EnergyType get_energy(
 	return energy;
 }
 
-template<
-		class Ring = Semiring<
-			grb::operators::add< JType >, grb::operators::mul< JType >,
-			grb::identities::zero, grb::identities::one
-		>,
-		Backend backend = internal_backend,
-		grb::Descriptor descr = grb::descriptors::no_operation
-	>
-EnergyType sequential_sweep_immediate(
-				 grb::Vector< IOType, backend > &state,
-				 const JType &beta,
-				 std::tuple<
-				 	 const grb::Matrix< JType, backend >&,
-				 	 const grb::Vector< JType, backend >&,
-					 grb::Vector< JType, backend >&,
-					 grb::Vector< JType, backend >&,
-					 grb::Vector< JType, backend >&,
-					 const std::vector< grb::Vector< bool, backend > >&,
-					 grb::Vector< JType, backend >&,
-					 grb::Vector< bool, backend >&,
-					 std::minstd_rand&
-					 > &data
-			  ){
-		const size_t s = spmd<>::pid();
-		const Ring ring = Ring();
-		constexpr auto dense_descr = descr | grb::descriptors::dense;
-		(void) s;
-
-		EnergyType delta_energy = static_cast< EnergyType >(0.0);
-		const auto &couplings 	= std::get<0>(data);
-		const auto &local_fields = std::get<1>(data);
-		auto &h 		= std::get<2>(data);
-		auto &rand	= std::get<3>(data);
-		auto &delta		= std::get<4>(data);
-		const auto &masks = std::get<5>(data);
-		auto &dn		= std::get<6>(data);
-		auto &accept	= std::get<7>(data);
-		auto &rng       = std::get<8>(data);
-
-		grb::RC rc = grb::SUCCESS;
-		const size_t n = grb::size( state );
-		assert( grb::nnz(state) == n ); // state has to be dense!
-		assert( grb::nnz(local_fields) == n );
-
-		rc = rc ? rc : grb::wait();
-		rc = rc ? rc : grb::resize( h, n );
-		rc = rc ? rc : grb::resize( rand, n );
-		rc = rc ? rc : grb::resize( delta, n );
-		rc = rc ? rc : grb::resize( dn, n );
-		rc = rc ? rc : grb::resize( accept, n );
-
-		rc = rc ? rc : grb::set< dense_descr >( h, local_fields );
-		rc = rc ? rc : grb::mxv< dense_descr >( h, couplings, state , ring );
-
-		std::exponential_distribution< EnergyType > rand_gen ( beta );
-		for( size_t i = 0 ; i < n; ++i ){
-			const auto rnd = -rand_gen( rng );
-			rc = rc ? rc : grb::setElement( rand, rnd, i );
-		}
-
-		const grb::operators::leq< EnergyType > leq_operator;
-		const grb::operators::right_assign< EnergyType > right_assign_op;
-		const grb::operators::not_equal< EnergyType > neq_operator;
-#ifndef NDEBUG
-		const grb::Vector< IOType, backend > old_state = state;
-#endif
-		for(const auto &mask : masks ){
-			// dn = (2*state_slice - 1) * h_slice
-			rc = rc ? rc : grb::set< descr >( dn, mask, state );
-			rc = rc ? rc : grb::foldl< descr | grb::descriptors::invert_mask >( dn, state, static_cast< JType >( -1 ), right_assign_op );
-			rc = rc ? rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() );
-
-			// Choose which changes to accept
-			// ( dn >= 0 ) | ( rand/beta < dn )
-			rc = rc ? rc : grb::foldl< descr >( dn, rand, leq_operator );
-			rc = rc ? rc : grb::set< descr >( accept, dn, mask );
-
-			// new_state = np.where(accept, 1 - old, old)
-			rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< IOType >( 1 ), neq_operator );
-			
-			// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
-			rc = rc ? rc : grb::set< descr >( delta, accept, state );
-			rc = rc ? rc : grb::foldl< descr | grb::descriptors::invert_mask >( delta, delta, static_cast< EnergyType >( -1 ), right_assign_op );
-			
-			// Update delta_energy -= dot(dn, accept)
-			rc = rc ? rc : grb::dot< descr >( delta_energy, delta, h, ring );
-
-			// update h
-			rc = rc ? rc : grb::mxv< descr >( h, couplings, delta, ring );
-			
-		}
-		rc = rc ? rc : grb::wait();
-
-#ifndef NDEBUG
-		if( rc != grb::SUCCESS ){
-			std::cerr << "\n\t Error in some GraphBLAS function in sequential_sweep_immediate " << rc << " : " << grb::toString( rc ) << std::endl;
-			abort();
-		}
-		assert( rc == grb::SUCCESS );
-		const auto new_state = state;
-
-		const auto real_delta = get_energy( couplings, local_fields, new_state, dn ) - get_energy( couplings, local_fields, old_state, dn );
-		if(s == 0){
-			std::cerr << "\n\t Delta_energy: " << delta_energy;
-			std::cerr << "\n\t Real delta: " << real_delta;
-			std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy;
-			std::cerr << std::endl;
-
-		}
-		assert( ISCLOSE(real_delta, delta_energy ) );
-#endif
-
-		return delta_energy;
-}
-
-
-template<
-		Backend backend,
-		typename SweepDataType = std::tuple<
-				 	 const grb::Matrix< JType, backend >&,
-				 	 const grb::Vector< JType, backend >&,
-					 grb::Vector< JType, backend >&,
-					 grb::Vector< JType, backend >&,
-					 grb::Vector< JType, backend >&,
-					 const std::vector< grb::Vector< bool, backend > >&,
-					 grb::Vector< JType, backend >&,
-					 grb::Vector< bool, backend >&,
-					 std::minstd_rand&
-					 >,
-		typename SweepFuncType = std::function< EnergyType(
-					 const grb::Matrix< JType, backend >&,
-					 const grb::Vector< JType, backend >&,
-					 grb::Vector< IOType, backend >&,
-					 const JType&,
-					 SweepDataType&
-				 ) >,
-		class Ring = Semiring<
-			grb::operators::add< JType >, grb::operators::mul< JType >,
-			grb::identities::zero, grb::identities::one
-		>
-	>
-SweepFuncType get_sweep_function( const char sweep_name[] ){
-	if( std::strcmp(sweep_name, "sequential_sweep_immediate") != 0 ){
-			std::cerr << "Warning: unknown sweep setting. Falling back to  \"sequential_sweep_immediate\"" << std::endl;
-	}
-	 return sequential_sweep_immediate< Ring >;
-}
-
 void ioProgram( const struct input &data_in, bool &success ) {
 
     using namespace test_data;
@@ -473,9 +325,8 @@ void ioProgram( const struct input &data_in, bool &success ) {
 		auto &n_replicas_st = std::get<3>(storage); // n_replicas
 		auto &use_pt      = std::get<4>(storage); // use_pt
 		auto &seed_st     = std::get<5>(storage); // seed
-		auto &sweep_name  = std::get<6>(storage); // sweep_name
-		auto &Jdata       = std::get<7>(storage); // std::vector<NonzeroT>
-		auto &h           = std::get<8>(storage); // std::vector<JType>
+		auto &Jdata       = std::get<6>(storage); // std::vector<NonzeroT>
+		auto &h           = std::get<7>(storage); // std::vector<JType>
 
 		// Initialize metadata from input (allow CLI to override defaults)
 		(void) n;
@@ -484,7 +335,6 @@ void ioProgram( const struct input &data_in, bool &success ) {
 		n_replicas_st = data_in.n_replicas;
 		use_pt        = data_in.use_pt;
 		seed_st       = data_in.seed;
-        std::strncpy( sweep_name, data_in.sweep_name, MAX_FN_SIZE+1 );
 
 
 		if ( data_in.use_default_data ) {
@@ -526,7 +376,7 @@ void grbProgram(
 	// get user process ID
 	const size_t s = spmd<>::pid();
 	const size_t nprocs = spmd<>::nprocs();
-
+	(void) nprocs;
 
     grb::utils::Timer timer;
 	timer.reset();
@@ -545,7 +395,7 @@ void grbProgram(
     // load into GraphBLAS
     grb::Matrix< JType, internal_backend > J( n, n );
 	{
-		const auto &data = std::get<7>(Storage::getData());
+		const auto &data = std::get<6>(Storage::getData());
 		RC io_rc = buildMatrixUnique(
 			J,
 			utils::makeNonzeroIterator<
@@ -585,7 +435,7 @@ void grbProgram(
 
     // build vector h with data from singleton
     {
-        const auto &h_data = std::get<8>(Storage::getData());
+        const auto &h_data = std::get<7>(Storage::getData());
 		rc = rc ? rc : buildVector(
 			h,
 			h_data.cbegin(),
@@ -620,11 +470,6 @@ void grbProgram(
         );
 		rc = rc ? rc : grb::set( states.back(), states0.back() );
     }
-	using Ring = Semiring<
-			grb::operators::add< JType >, grb::operators::mul< JType >,
-			grb::identities::zero, grb::identities::one >;
-	
-	// const auto sweep = sequential_sweep_immediate< Ring >; // get_sweep_function( data_in.sweep_name );
 
     // also make betas vector os size n_replicas and initialize with 10.0
     grb::Vector< JType, internal_backend > betas( n_replicas );
@@ -710,6 +555,7 @@ void grbProgram(
 			for ( size_t r = 0; r < n_replicas; ++r ) {
 				rc = rc ? rc : grb::set(states[r], states0[r]);
 			}
+			out.best_energy = std::numeric_limits< EnergyType >::max();
 			rc = rc ? rc : grb::set( energies, energies0 );
 			timer.reset();
 			if( rc == SUCCESS ) {
@@ -727,6 +573,9 @@ void grbProgram(
 			min_time = std::min(min_time, time_taken);
 			max_time = std::max(max_time, time_taken);
 			total_time +=  time_taken;
+			if(s == 0){
+				std::cerr << n_replicas << "," << data_in.nsweeps << "," << time_taken << "," << out.best_energy << std::endl;
+			}
 		}
 
 		out.times.useful = total_time / static_cast< double >( out.rep );
@@ -810,9 +659,6 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
         } else if ( a == "--seed" ) {
             if ( i+1 >= argc ) { std::cerr << "--seed requires an argument\n"; return false; }
             in.seed = static_cast<unsigned>( std::stoul(argv[++i]) );
-        } else if ( a == "--sweep" ) {
-            if ( i+1 >= argc ) { std::cerr << "--sweep requires an argument\n"; return false; }
-			std::strncpy( in.sweep_name, argv[++i], MAX_FN_SIZE );
         } else if ( a == "--rep" ) {
             if ( i+1 >= argc ) { std::cerr << "--rep requires an argument\n"; return false; }
             in.rep = static_cast<unsigned>( std::stoul(argv[++i]) );

From 75120092016ac5cb9d8a81e80e36fec6d7451994 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Wed, 10 Dec 2025 10:48:05 +0100
Subject: [PATCH 45/58] Added reference energy goal for early stop

---
 .../algorithms/simulated_annealing_re.hpp     | 18 ++++++++++---
 .../smoke/simulated_annealing_re_from_mpi.cpp | 25 ++++++++++---------
 tests/smoke/simulated_annealing_re_ising.cpp  | 22 +++++++++++-----
 3 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index d03d6fd2b..44bb64cd3 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -296,6 +296,7 @@ namespace grb {
 				grb::Vector< StateType, backend >  &best_state,
 				EnergyType &best_energy,
 				const size_t &n_sweeps,
+				const EnergyType &goal = 0,
 				const bool &use_pt = false,
 				const size_t &seed = 42
 				){
@@ -323,6 +324,7 @@ namespace grb {
 						  << "\n\t n = " << n
 						  << "\n\t n_replicas = " << n_replicas
 						  << "\n\t n_sweeps = " << n_sweeps
+						  << "\n\t goal = " << goal
 						  << "\n\t use_pt = " << use_pt
 						  << std::endl;
 			}
@@ -342,16 +344,24 @@ namespace grb {
 						best_energy = energies[j];
 						best_state = states[j];
 					}
+					if( goal < -1 && best_energy <= goal ) break;
 				} // n_replicas
+
+				// TODO: find a better way than this, to avoid a sync at each iteration
+				rc = rc ? rc : grb::collectives<>::allreduce(
+						best_energy, grb::operators::min< EnergyType >() );
+
 				if( rc == SUCCESS && use_pt ){
 					// do a Parallel Tempering move
 					rc = pt< backend >( states, energies, betas, seed + i_sweep*n_procs + s );
 				}
+
 #ifndef NDEBUG
 				if( s == 0 ) {
-					std::cerr << "Energy at iteration " << i_sweep << " = " << energies[ 0 ] << std::endl;
+					std::cerr << "Energy at iteration " << i_sweep << " = " << best_energy << std::endl;
 				}
 #endif
+				if( goal < -1 &&  best_energy <= goal ) i_sweep = n_sweeps;
 			} // n_sweeps
 
 #ifndef NDEBUG
@@ -534,6 +544,7 @@ namespace grb {
 				grb::Vector< StateType, backend > &best_state,
 				EnergyType &best_energy,
 				const size_t &n_sweeps,
+				const EnergyType &goal = 0,
 				const bool &use_pt = false,
 				const int seed = 42,
 				const Ring &ring = Ring()
@@ -717,7 +728,7 @@ namespace grb {
 			};
 
 			return simulated_annealing_RE(
-					ising_sweep, sweep_data, states, energies, betas, best_state, best_energy, n_sweeps, use_pt
+					ising_sweep, sweep_data, states, energies, betas, best_state, best_energy, n_sweeps, goal, use_pt
 					);
 		}
 
@@ -768,6 +779,7 @@ namespace grb {
 				grb::Vector< StateType, backend > &best_state,
 				EnergyType &best_energy,
 				const size_t &n_sweeps,
+				const EnergyType &goal = 0,
 				const bool &use_pt = false,
 				const int seed = 42,
 				const Ring &ring = Ring()
@@ -775,7 +787,7 @@ namespace grb {
 			grb::Vector< QType > empty_local_fields ( 0 );
 
 			return simulated_annealing_RE_Ising< backend, descr, true >(
-					Q, empty_local_fields, states, energies, betas, best_state, best_energy, n_sweeps, use_pt, seed, ring
+					Q, empty_local_fields, states, energies, betas, best_state, best_energy, n_sweeps, goal, use_pt, seed, ring
 					);
 		}
 	} // namespace algorithms
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index aeff80c2c..2d73c7aa5 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -142,7 +142,7 @@ struct input {
     size_t nsweeps = test_data::nsweeps;
     bool use_pt = test_data::use_pt;
     unsigned seed = test_data::seed;
-    char sweep_name [ MAX_FN_SIZE + 1 ] = "sequential_sweep_immediate";
+    EnergyType reference_energy = 0.0;
     bool verify = false;
     char filename_Jmatrix [ MAX_FN_SIZE + 1 ];
     char filename_h [ MAX_FN_SIZE + 1 ];
@@ -501,7 +501,7 @@ void grbProgram(
 	if( out.rep == 0 ) {
 		timer.reset();
 		rc = grb::algorithms::simulated_annealing_RE_Ising(
-				 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
+			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.reference_energy, data_in.use_pt, data_in.seed
         );
 
 		rc = rc ? rc : wait();
@@ -544,7 +544,7 @@ void grbProgram(
 			rc = rc ? rc : grb::set( energies, energies0 );
 
 			rc = grb::algorithms::simulated_annealing_RE_Ising(
-				J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
+			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.reference_energy, data_in.use_pt, data_in.seed
 			);
 		}
 		// do benchmark
@@ -562,13 +562,10 @@ void grbProgram(
 				out.iterations = data_in.nsweeps;
 
 				rc = grb::algorithms::simulated_annealing_RE_Ising(
-					 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
+					J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.reference_energy, data_in.use_pt, data_in.seed + i
 				);
 				grb::collectives<>::allreduce( out.best_energy, grb::operators::min< EnergyType >() );
 			}
-			if( grb::Properties<>::isNonblockingExecution ) {
-				rc = rc ? rc : wait();
-			}
 			const double time_taken = timer.time();
 			min_time = std::min(min_time, time_taken);
 			max_time = std::max(max_time, time_taken);
@@ -609,8 +606,8 @@ void grbProgram(
 // --- Simple help / CLI parser for the new runner (no backward compatibility) ---
 void printhelp( char *progname ) {
     std::cout << "Usage: " << progname << " [--use-default-data] [--j-matrix-fname STR] [--h-fname STR]\n"
-              << "       [--n-replicas INT] [--nsweeps INT] [--seed INT] [--sweep STR]\n"
-              << "       [--verify] [--ref-solution-fname STR] [--help]\n\n"
+              << "       [--n-replicas INT] [--nsweeps INT] [--seed INT]\n"
+              << "       [--rep INT] [--goal INT] [--verify] [--ref-solution-fname STR] [--help]\n\n"
               << "Options:\n"
               << "  --use-default-data         Use embedded default test data\n"
               << "  --j-matrix-fname STR       Path to J matrix file (matrix-market or supported)\n"
@@ -619,8 +616,8 @@ void printhelp( char *progname ) {
               << "  --nsweeps INT              Number of sweeps (default: 2)\n"
               << "  --use-pt BOOL              Use Parallel Tampering (default: 1)\n"
               << "  --seed INT                 RNG seed (default: 8)\n"
-              << "  --sweep STR                Sweep selector (default: sequential_sweep_immediate)\n"
-              << "  --rep INT                  number of times to repeat the run of the algorithm (default: 1)\n"
+              << "  --rep INT                  Number of times to repeat the run of the algorithm (default: 1)\n"
+              << "  --goal FLOAT               The value of the energy to achieve before stopping (default: 0, no such check).\n"
               << "  --verify                   Verify output against reference solution\n"
               << "  --ref-solution-fname STR   Reference solution file (required with --verify unless using default data)\n"
               << "  --help, -h                 Print this help message\n";
@@ -634,6 +631,7 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
     // map benchmarking configuration to the runner's fields
     in.rep = grb::config::BENCHMARKING::inner();
     in.outer = grb::config::BENCHMARKING::outer();
+    in.reference_energy = static_cast<EnergyType>( 0.0 );
     // keep verify default (false) unless overridden via CLI
     in.verify = false;
 
@@ -662,6 +660,9 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
         } else if ( a == "--rep" ) {
             if ( i+1 >= argc ) { std::cerr << "--rep requires an argument\n"; return false; }
             in.rep = static_cast<unsigned>( std::stoul(argv[++i]) );
+        } else if ( a == "--goal" ) {
+            if ( i+1 >= argc ) { std::cerr << "--goal requires an argument\n"; return false; }
+            in.reference_energy = std::stof(argv[++i]);
         } else if ( a == "--verify" ) {
             in.verify = true;
         } else if ( a == "--ref-solution-fname" ) {
@@ -709,7 +710,7 @@ int main( int argc, char ** argv ) {
     }
 
 
-    std::cout << "seed=" << in.seed << " n_replicas=" << in.n_replicas << " nsweeps=" << in.nsweeps << " sweep=" << in.sweep_name << "\n";
+    std::cout << "seed=" << in.seed << " n_replicas=" << in.n_replicas << " nsweeps=" << in.nsweeps << " sweep=ising_sweep_spmd" << "\n";
 
     // Run IO program (populates Storage or similar)
     {
diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp
index ffe3a1aec..e3f7794d6 100644
--- a/tests/smoke/simulated_annealing_re_ising.cpp
+++ b/tests/smoke/simulated_annealing_re_ising.cpp
@@ -136,7 +136,7 @@ struct input {
     bool use_default_data = false;
     char filename_Jmatrix [ MAX_FN_SIZE + 1 ];
     char filename_h [ MAX_FN_SIZE + 1 ];
-    char sweep_name [ MAX_FN_SIZE + 1 ]= "sequential_sweep_immediate";
+    EnergyType reference_energy = 0.0;
     bool verify = false;
     char filename_ref_solution [ MAX_FN_SIZE + 1 ];
 	bool direct;
@@ -492,7 +492,7 @@ void grbProgram(
 	if( out.rep == 0 ) {
 		timer.reset();
 		rc = grb::algorithms::simulated_annealing_RE_Ising(
-			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
+			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.reference_energy, data_in.use_pt, data_in.seed
         );
 
 		rc = rc ? rc : wait();
@@ -531,10 +531,13 @@ void grbProgram(
 			for ( size_t r = 0; r < n_replicas; ++r ) {
 				rc = rc ? rc : grb::set(states[r], states0[r]);
 			}
+			out.best_energy = std::numeric_limits< EnergyType >::max();
 			rc = rc ? rc : grb::clear( energies );
+
 			rc = grb::algorithms::simulated_annealing_RE_Ising(
-			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed
+			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.reference_energy, data_in.use_pt, data_in.seed + i
 			);
+
 			assert( ISCLOSE( get_energy(  J, h, best_state, tmp_energy ), out.best_energy) );
 		}
 		// do benchmark
@@ -545,13 +548,14 @@ void grbProgram(
 			for ( size_t r = 0; r < n_replicas; ++r ) {
 				rc = rc ? rc : grb::set(states[r], states0[r]);
 			}
+			out.best_energy = std::numeric_limits< EnergyType >::max();
 			rc = rc ? rc : grb::clear( energies );
 			timer.reset();
 			if( rc == SUCCESS ) {
 				out.iterations = data_in.nsweeps;
 
                 rc = grb::algorithms::simulated_annealing_RE_Ising(
-				 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.use_pt, data_in.seed + i
+			 J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.reference_energy, data_in.use_pt, data_in.seed + i
                 );
 			}
 			if( grb::Properties<>::isNonblockingExecution ) {
@@ -563,6 +567,7 @@ void grbProgram(
 			min_time = std::min(min_time, time_taken);
 			max_time = std::max(max_time, time_taken);
 			total_time +=  time_taken;
+			std::cerr << n_replicas << "," << data_in.nsweeps << "," << time_taken << "," << out.best_energy << std::endl;
 		}
 
 		out.times.useful = total_time / static_cast< double >( out.rep );
@@ -604,8 +609,8 @@ void grbProgram(
 // --- Simple help / CLI parser for the new runner (no backward compatibility) ---
 void printhelp( char *progname ) {
     std::cout << "Usage: " << progname << " [--use-default-data] [--j-matrix-fname STR] [--h-fname STR]\n"
-              << "       [--n-replicas INT] [--nsweeps INT] [--seed INT] [--sweep STR]\n"
-              << "       [--verify] [--ref-solution-fname STR] [--help]\n\n"
+              << "       [--n-replicas INT] [--nsweeps INT] [--seed INT]\n"
+              << "       [--rep INT] [--goal INT] [--verify] [--ref-solution-fname STR] [--help]\n\n"
               << "Options:\n"
               << "  --use-default-data         Use embedded default test data\n"
               << "  --j-matrix-fname STR       Path to J matrix file (matrix-market or supported)\n"
@@ -615,6 +620,7 @@ void printhelp( char *progname ) {
               << "  --use-pt BOOL              Use Parallel Tampering (default: 1)\n"
               << "  --seed INT                 RNG seed (default: 8)\n"
               << "  --rep INT                  number of times to repeat the run of the algorithm (default: 1)\n"
+              << "  --goal FLOAT               The value of the energy to achieve before stopping (default: 0, no such check).\n"
               << "  --verify                   Verify output against reference solution\n"
               << "  --ref-solution-fname STR   Reference solution file (required with --verify unless using default data)\n"
               << "  --help, -h                 Print this help message\n";
@@ -628,6 +634,7 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
     // map benchmarking configuration to the runner's fields
     in.rep = grb::config::BENCHMARKING::inner();
     in.outer = grb::config::BENCHMARKING::outer();
+    in.reference_energy = static_cast<EnergyType>( 0.0 );
     // keep verify default (false) unless overridden via CLI
     in.verify = false;
 
@@ -656,6 +663,9 @@ bool parse_arguments( input &in, int argc, char ** argv ) {
         } else if ( a == "--seed" ) {
             if ( i+1 >= argc ) { std::cerr << "--seed requires an argument\n"; return false; }
             in.seed = static_cast<unsigned>( std::stoul(argv[++i]) );
+        } else if ( a == "--goal" ) {
+            if ( i+1 >= argc ) { std::cerr << "--goal requires an argument\n"; return false; }
+            in.reference_energy = std::stof(argv[++i]);
         } else if ( a == "--verify" ) {
             in.verify = true;
         } else if ( a == "--ref-solution-fname" ) {

From 26238b7628e62257a1ba750ebab3462999d7dd0f Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 16 Dec 2025 10:05:59 +0100
Subject: [PATCH 46/58] Fixed birthday paradox bug in matrix_partition

---
 .../algorithms/simulated_annealing_re.hpp        | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 44bb64cd3..6ece8664c 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -344,7 +344,6 @@ namespace grb {
 						best_energy = energies[j];
 						best_state = states[j];
 					}
-					if( goal < -1 && best_energy <= goal ) break;
 				} // n_replicas
 
 				// TODO: find a better way than this, to avoid a sync at each iteration
@@ -423,10 +422,20 @@ namespace grb {
 			grb::resize( w, n );
 
     		std::minstd_rand rng ( seed );
-			std::uniform_real_distribution< AType > rand ( 0.1, 2.0 );
 
+			// random shuffle w
+			// const auto w_raw = grb::internal::getRaw( w );
 			for( size_t i = 0 ; i < n ; ++i ){
-				rc = rc ? rc : grb::setElement( w, rand( rng ), i );
+				rc = rc ? rc : grb::setElement( w, i+1, i );
+			}
+			for( size_t i = 0 ; i < n ; ++i ){
+				std::uniform_int_distribution< size_t > rand ( i, n-1 );
+				const auto j = rand(rng);
+				const auto a = w[i];
+				const auto b = w[j];
+				rc = rc ? rc : grb::setElement( w, b, i );
+				rc = rc ? rc : grb::setElement( w, a, j );
+				// std::swap( w_raw[i],  w_raw[j] );
 			}
 
 			const grb::Semiring<
@@ -512,6 +521,7 @@ namespace grb {
 		 * @param[in,out] betas     	Inverse temperature of each state.
 		 * @param[in]     n_sweeps      Number of Simulated Annealing iterations.
 		 * @param[in]     use_pt		Whether to use Parallel Tampering or not.
+		 * @param[in]     seed			Seed to use for internal randomization (must be the same for all processes).
 		 *
 		 * @tparam StateType	The state variable type.
 		 * @tparam QType		The matrix values' type.

From 702080593bd85535f9dea12c9fd8aeac30d3a33d Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Thu, 18 Dec 2025 14:46:45 +0100
Subject: [PATCH 47/58] Reproducible results with different number of processes

---
 .../algorithms/simulated_annealing_re.hpp     | 84 +++++++------------
 .../smoke/simulated_annealing_re_from_mpi.cpp | 18 ++--
 2 files changed, 42 insertions(+), 60 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 6ece8664c..61de06d7a 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -43,32 +43,6 @@
 #define ISCLOSE(a,b) (std::abs((b)-(a))/std::abs(a) < 1e-4) || (std::abs((b)-(a)) < 1e-4)
 
 namespace grb {
-	namespace internal {
-		/*
-		 * The following functions are used to ensure the correct type of the value in
-		 * in the exponential function.
-		 */
-		template< typename T >
-		inline T exp(T x ){
-			static_assert(
-					std::is_same<T, float>::value
-				 || std::is_same<T, double>::value
-				 || std::is_same<T, long double>::value
-					);
-			return std::exp( x );
-		}
-
-		template< typename T >
-		inline T log(T x ){
-			static_assert(
-					std::is_same<T, float>::value
-				 || std::is_same<T, double>::value
-				 || std::is_same<T, long double>::value
-					);
-			return std::log( x );
-		}
-	} // namespace internal
-
 	namespace algorithms {
 
 		/*
@@ -154,13 +128,12 @@ namespace grb {
 			assert( grb::size(energies) == n_replicas );
 			assert( grb::size(betas) == n_replicas );
 #endif
-			std::minstd_rand rng ( seed + s );
-			std::exponential_distribution< EnergyType > rand ( 1.0 );
 			struct data {
 					EnergyType e;
 					TempType b;
 					EnergyType r;
 				};
+			// TODO: should these two be static? Probably.
 			grb::Vector< StateType, backend > s0 ( n );
 			grb::Vector< StateType, backend > s1 ( n );
 			grb::set( s0, static_cast< StateType >( 0 ) );
@@ -170,13 +143,19 @@ namespace grb {
 			rc = rc ? rc : grb::resize( s0, n );
 			rc = rc ? rc : grb::resize( s1, n );
 			if( rc != grb::SUCCESS ) return rc;
-			const auto myrand = -rand( rng );
+
+			std::minstd_rand rng;
+			std::exponential_distribution< EnergyType > rand ( 1.0 );
+
+			rng.seed( seed + s*n_replicas );
+			const EnergyType myrand = -rand( rng );
 
 			for( size_t si = nprocs ; rc == grb::SUCCESS && si > 0; --si ){
 				if( si-1 == s ){
 					for( size_t i = n_replicas - 1 ; i > 0 ; --i ){
 						const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
 
+						rng.seed( seed + s*n_replicas + i );
 						if( -rand( rng ) < de ){
 							std::swap( states[i], states[i-1] );
 							std::swap( energies[i], energies[i-1] );
@@ -185,7 +164,7 @@ namespace grb {
 					grb::set( s1, states[0] );
 					msg[ 1 ].e = energies[ 0 ];
 					msg[ 1 ].b = betas[0];
-					// msg[ 1 ].r = rand;
+					msg[ 1 ].r = myrand;
 				}else if( si-2 == s ){
 					grb::set( s0, states[ n_replicas - 1 ] );
 					msg[ 0 ].e = energies[ n_replicas - 1 ];
@@ -200,6 +179,7 @@ namespace grb {
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].r, si-2 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].e, si-1 );
 				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].b, si-1 );
+				rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].r, si-1 );
 #else
 				assert( false ); // this should never run
 #endif
@@ -215,21 +195,21 @@ namespace grb {
 
 				const EnergyType de = ( msg[ 1 ].e - msg[ 0 ].e ) * ( msg[ 1 ].b - msg[ 0 ].b );
 
-				if( rc == grb::SUCCESS && ( msg[ 0 ].r < de ) ){
+				if( rc == grb::SUCCESS && ( msg[ 1 ].r < de ) ){
 #ifdef _GRB_WITH_LPF
-					rc = rc ? rc : grb::internal::broadcast( s0, si-2 );
 					rc = rc ? rc : grb::internal::broadcast( s1, si-1 );
+					rc = rc ? rc : grb::internal::broadcast( s0, si-2 );
 					assert( grb::nnz(s0) == n ); // state has to be dense!
 					assert( grb::nnz(s1) == n ); // state has to be dense!
 #else
 					assert( false ); // this should never run
 #endif
-					if( si == s+1 ){
-						rc = rc ? rc : grb::set( states[ n_replicas - 1 ], s1 );
-						rc = rc ? rc : grb::setElement(energies, msg[ 0 ].e, n_replicas - 1 );
-					}else if( si ==  s+2 ){
+					if( si-1 == s ){
 						rc = rc ? rc : grb::set( states[ 0 ], s0 );
-						rc = rc ? rc : grb::setElement(energies, msg[ 1 ].e, 0 );
+						rc = rc ? rc : grb::setElement( energies, msg[ 0 ].e, 0 );
+					}else if( si-2 ==  s ){
+						rc = rc ? rc : grb::set( states[ n_replicas - 1 ], s1 );
+						rc = rc ? rc : grb::setElement( energies, msg[ 1 ].e, n_replicas - 1 );
 					}
 				}
 			}
@@ -308,18 +288,16 @@ namespace grb {
 			(void) n;
 			(void) s;
 
+			grb::RC rc = grb::SUCCESS;
+
+#ifndef NDEBUG
 			assert( n_replicas > 0 );
 			assert( n_replicas == grb::size( betas ) );
 
 			for(size_t i = 0; i < n_replicas ; ++i ){
 				assert( n == grb::size( states[ i ] ) );
 			}
-
-			grb::RC rc = grb::SUCCESS;
-
-
-#ifndef NDEBUG
-			if( grb::spmd<>::pid() == 0 ) {
+			if( s == 0 ) {
 				std::cerr << "DEBUG: Called  simulated_annealing_RE with parameters: "
 						  << "\n\t n = " << n
 						  << "\n\t n_replicas = " << n_replicas
@@ -328,14 +306,18 @@ namespace grb {
 						  << "\n\t use_pt = " << use_pt
 						  << std::endl;
 			}
+			assert( grb::size(best_state) == n );
 #endif
 
 			best_energy = std::numeric_limits< EnergyType >::max();
-			assert( grb::size(best_state) >= n );
 
 			for( size_t i_sweep = 0 ; rc == grb::SUCCESS && i_sweep < n_sweeps ; ++i_sweep ){
 				for( size_t j = 0 ; j < n_replicas ; ++j ){
-					
+
+					const int seedi = i_sweep*n_procs*n_replicas + n_replicas*s + j;
+
+					std::get<8>(sweep_data).seed(seedi);
+
 					energies[j] += sweep( states[j], betas[j], sweep_data );
 					grb::wait();
 				
@@ -352,7 +334,7 @@ namespace grb {
 
 				if( rc == SUCCESS && use_pt ){
 					// do a Parallel Tempering move
-					rc = pt< backend >( states, energies, betas, seed + i_sweep*n_procs + s );
+					rc = pt( states, energies, betas, seed + i_sweep );
 				}
 
 #ifndef NDEBUG
@@ -421,21 +403,19 @@ namespace grb {
 			grb::resize( frontier, n );
 			grb::resize( w, n );
 
-    		std::minstd_rand rng ( seed );
+			std::minstd_rand rng ( seed );
 
 			// random shuffle w
-			// const auto w_raw = grb::internal::getRaw( w );
 			for( size_t i = 0 ; i < n ; ++i ){
 				rc = rc ? rc : grb::setElement( w, i+1, i );
 			}
 			for( size_t i = 0 ; i < n ; ++i ){
 				std::uniform_int_distribution< size_t > rand ( i, n-1 );
-				const auto j = rand(rng);
+				const auto j = rand( rng );
 				const auto a = w[i];
 				const auto b = w[j];
 				rc = rc ? rc : grb::setElement( w, b, i );
 				rc = rc ? rc : grb::setElement( w, a, j );
-				// std::swap( w_raw[i],  w_raw[j] );
 			}
 
 			const grb::Semiring<
@@ -486,7 +466,7 @@ namespace grb {
 						if( s == 0 ) {
 							std::cerr << x.first << ", ";
 						}
-						cnt ++;
+						cnt++;
 					}
 				}
 				if( s == 0 ) {
@@ -610,7 +590,7 @@ namespace grb {
 			grb::Vector< QType, backend > delta ( n );
 			grb::Vector< QType, backend > dn ( n );
 			grb::Vector< bool, backend > accept ( n );
-    		std::minstd_rand rng ( seed ); // minstd_rand or std::mt19937
+			std::minstd_rand rng ( seed ); // minstd_rand or std::mt19937
 
 			rc = rc ? rc : grb::resize( h, n );
 			rc = rc ? rc : grb::resize( rand, n );
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 2d73c7aa5..7156bbc05 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -445,23 +445,25 @@ void grbProgram(
     }
 
     // seed RNGs (C and C++ engines) using requested seed (hardcoded default 8 if not provided)
-    std::minstd_rand rng ( data_in.seed + s ); // rng or std::mt19937
+    std::minstd_rand rng ( data_in.seed ); // rng or std::mt19937
 
     // create states storage and initialize with random 1/0 values
     const size_t n_replicas = data_in.n_replicas;
     std::vector< grb::Vector< IOType, internal_backend > > states0;
     std::vector< grb::Vector< IOType, internal_backend > > states;
-    for ( size_t r = 0; r < n_replicas; ++r ) {
-        states.emplace_back( grb::Vector< IOType, internal_backend >(n) );
-        states0.emplace_back( grb::Vector< IOType, internal_backend >(n) );
+    std::vector< IOType > rand_data (n);
+    for ( size_t r = 0; r < nprocs * n_replicas; ++r ) {
         // initialize with random values
-        std::uniform_int_distribution< unsigned short > randint(0,1);
+        std::uniform_int_distribution< IOType > randint(0,1);
         // we use buildvectorUnique with a random set of indices
-        std::vector< IOType > rand_data;
         for ( size_t i = 0; i < n; ++i ) {
-            rand_data.emplace_back( static_cast<IOType>(
-                randint( rng ) ) );
+            rand_data[i] = randint( rng );
         }
+
+		if( r/n_replicas != s ) continue;
+
+        states.emplace_back( n );
+        states0.emplace_back( n );
         rc = rc ? rc : grb::buildVector(
             states0.back(),
             rand_data.cbegin(),

From 734acb217b336e684e7a90b821da64868c116ba0 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Thu, 18 Dec 2025 15:24:49 +0100
Subject: [PATCH 48/58] Back to random behavior

---
 include/graphblas/algorithms/simulated_annealing_re.hpp | 7 +------
 tests/smoke/simulated_annealing_re_from_mpi.cpp         | 6 ++----
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 61de06d7a..9950b8bcb 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -147,7 +147,7 @@ namespace grb {
 			std::minstd_rand rng;
 			std::exponential_distribution< EnergyType > rand ( 1.0 );
 
-			rng.seed( seed + s*n_replicas );
+			rng.seed( seed + s );
 			const EnergyType myrand = -rand( rng );
 
 			for( size_t si = nprocs ; rc == grb::SUCCESS && si > 0; --si ){
@@ -155,7 +155,6 @@ namespace grb {
 					for( size_t i = n_replicas - 1 ; i > 0 ; --i ){
 						const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]);
 
-						rng.seed( seed + s*n_replicas + i );
 						if( -rand( rng ) < de ){
 							std::swap( states[i], states[i-1] );
 							std::swap( energies[i], energies[i-1] );
@@ -314,10 +313,6 @@ namespace grb {
 			for( size_t i_sweep = 0 ; rc == grb::SUCCESS && i_sweep < n_sweeps ; ++i_sweep ){
 				for( size_t j = 0 ; j < n_replicas ; ++j ){
 
-					const int seedi = i_sweep*n_procs*n_replicas + n_replicas*s + j;
-
-					std::get<8>(sweep_data).seed(seedi);
-
 					energies[j] += sweep( states[j], betas[j], sweep_data );
 					grb::wait();
 				
diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index 7156bbc05..deb8562a3 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -445,14 +445,14 @@ void grbProgram(
     }
 
     // seed RNGs (C and C++ engines) using requested seed (hardcoded default 8 if not provided)
-    std::minstd_rand rng ( data_in.seed ); // rng or std::mt19937
+    std::minstd_rand rng ( data_in.seed + s ); // rng or std::mt19937
 
     // create states storage and initialize with random 1/0 values
     const size_t n_replicas = data_in.n_replicas;
     std::vector< grb::Vector< IOType, internal_backend > > states0;
     std::vector< grb::Vector< IOType, internal_backend > > states;
     std::vector< IOType > rand_data (n);
-    for ( size_t r = 0; r < nprocs * n_replicas; ++r ) {
+    for ( size_t r = 0; r < n_replicas; ++r ) {
         // initialize with random values
         std::uniform_int_distribution< IOType > randint(0,1);
         // we use buildvectorUnique with a random set of indices
@@ -460,8 +460,6 @@ void grbProgram(
             rand_data[i] = randint( rng );
         }
 
-		if( r/n_replicas != s ) continue;
-
         states.emplace_back( n );
         states0.emplace_back( n );
         rc = rc ? rc : grb::buildVector(

From 3a645e60e83278e05eb4701def2944d10aaaf885 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Wed, 7 Jan 2026 10:57:59 +0100
Subject: [PATCH 49/58] Fixed geometric temperature gradient

---
 tests/smoke/simulated_annealing_re_from_mpi.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp
index deb8562a3..1c84c5078 100644
--- a/tests/smoke/simulated_annealing_re_from_mpi.cpp
+++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp
@@ -471,13 +471,13 @@ void grbProgram(
 		rc = rc ? rc : grb::set( states.back(), states0.back() );
     }
 
-    // also make betas vector os size n_replicas and initialize with 10.0
+    // also make betas vector of size n_replicas and initialize with a geometric gradient
     grb::Vector< JType, internal_backend > betas( n_replicas );
     grb::Vector< EnergyType, internal_backend > energies( n_replicas );
     grb::Vector< EnergyType, internal_backend > energies0( n_replicas );
     grb::Vector< EnergyType, internal_backend > tmp_energy( n );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
-        rc = rc ? rc : grb::setElement( betas, static_cast< JType >( (10.0 / (s * n_replicas) ) * std::pow<JType>( 1.5, ( n_replicas * s + r ) ) ), r );
+		rc = rc ? rc : grb::setElement( betas, static_cast< JType >( ( 1.0 ) * std::pow< JType >( 1.5, ( n_replicas * s + r ) ) ), r );
         rc = rc ? rc : grb::setElement( energies0, get_energy(  J, h, states[r], tmp_energy ), r );
     }
 	rc = rc ? rc : grb::set( energies, energies0 );

From 99fbea170a6e733b4b4ea7a7f847f0dc297f2e6b Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Thu, 8 Jan 2026 10:26:54 +0100
Subject: [PATCH 50/58] Added more debug tests + seeding in
 simulated_annealing_ising affects simulated_annealing_RE

---
 .../algorithms/simulated_annealing_re.hpp     | 37 ++++++++++++++++++-
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 9950b8bcb..a97023aa0 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -303,6 +303,7 @@ namespace grb {
 						  << "\n\t n_sweeps = " << n_sweeps
 						  << "\n\t goal = " << goal
 						  << "\n\t use_pt = " << use_pt
+						  << "\n\t seed = " << seed
 						  << std::endl;
 			}
 			assert( grb::size(best_state) == n );
@@ -665,14 +666,46 @@ namespace grb {
 					rc = rc ? rc : grb::set< descr >( dn, mask, state );
 					rc = rc ? rc : grb::foldl< descr | grb::descriptors::invert_mask >( dn, state, static_cast< QType >( -1 ), right_assign_op );
 					rc = rc ? rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() );
+					assert( grb::nnz( dn ) == grb::nnz( mask ) );
+#ifndef NDEBUG
+					for( const auto x : dn ){
+						assert( mask[x.first] == 1 );
+						assert( (2*int(state[x.first])-1)*h[x.first] == x.second );
+					}
+					const auto dn0 = dn;
+#endif
 
 					// Choose which changes to accept
 					// ( dn >= 0 ) | ( rand/beta < dn )
 					rc = rc ? rc : grb::foldl< descr >( dn, rand, leq_operator );
 					rc = rc ? rc : grb::set< descr >( accept, dn, mask );
+					assert( grb::nnz( accept ) <= grb::nnz( mask ) );
+#ifndef NDEBUG
+					size_t cnt = 0;
+					for( const auto x : dn0 ){
+						const size_t i = x.first;
+						assert( mask[x.first] == 1 );
+						assert( x.second );
+						if( x.second >= rand[i] ){
+							assert( dn[i] == 1 );
+							assert( accept[i] == 1 );
+							cnt++;
+						}else{
+							assert( dn[i] == 0 );
+						}
+					}
+					assert( grb::nnz( accept ) == cnt );
+#endif
 
 					// new_state = np.where(accept, 1 - old, old)
 					rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( 1 ), neq_operator );
+#ifndef NDEBUG
+					for( const auto x : accept ){
+						const size_t i = x.first;
+						if( x.second ) assert( state0[i] == 1-state[i] );
+						else assert( state0[i] == state[i] );
+					}
+#endif
 					
 					// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
 					rc = rc ? rc : grb::set< descr >( delta, accept, state );
@@ -706,14 +739,14 @@ namespace grb {
 					std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy;
 					std::cerr << std::endl;
 				}
-
 				assert( ISCLOSE(real_delta, delta_energy ) );
 #endif
+
 				return delta_energy;
 			};
 
 			return simulated_annealing_RE(
-					ising_sweep, sweep_data, states, energies, betas, best_state, best_energy, n_sweeps, goal, use_pt
+					ising_sweep, sweep_data, states, energies, betas, best_state, best_energy, n_sweeps, goal, use_pt, seed
 					);
 		}
 

From 1425539e6d4bfdba07f6fa242641b8fabc71caf0 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Wed, 7 Jan 2026 10:58:47 +0100
Subject: [PATCH 51/58] Test with no masking

---
 .../algorithms/simulated_annealing_re.hpp     | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index a97023aa0..75f5f9150 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -285,6 +285,7 @@ namespace grb {
 			const size_t n_replicas = states.size();
 			const size_t n = grb::size(states[0]);
 			(void) n;
+			(void) n_procs;
 			(void) s;
 
 			grb::RC rc = grb::SUCCESS;
@@ -436,7 +437,7 @@ namespace grb {
 				}
 
 				// add new mask
-				masks.emplace_back( grb::Vector< bool, backend >( n ) );
+				masks.emplace_back( n );
 				auto &new_mask = masks.at(i);
 				rc = rc ? rc : grb::resize( new_mask, n );
 				rc = rc ? rc : grb::set< descr >( new_mask, frontier, static_cast< MaskType >(true) );
@@ -594,9 +595,22 @@ namespace grb {
 			rc = rc ? rc : grb::resize( dn, n );
 			rc = rc ? rc : grb::resize( accept, n );
 
-			std::vector< grb::Vector< bool, backend > > masks ;
+			std::vector< grb::Vector< bool, backend > > masks;
 			rc = rc ? rc : matrix_partition< descr >( masks, couplings, h, rand, seed );
-			rc = rc ? rc : grb::clear(h);
+
+			std::vector< grb::Vector< bool, backend > > trivial_masks;
+
+			for(const auto &mask : masks ){
+				for( const auto &x : mask ){
+					trivial_masks.emplace_back( n );
+					grb::setElement( trivial_masks.back(), x.second, x.first );
+				}
+			}
+			assert( trivial_masks.size() == n );
+
+			masks = trivial_masks;
+
+			rc = rc ? rc : grb::clear( h );
 			constexpr auto dense_descr = descr | grb::descriptors::dense;
 
 			auto sweep_data = std::tie(

From c094bec6eea5e69e46075f1c17925a5358e99064 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 9 Jan 2026 11:11:27 +0100
Subject: [PATCH 52/58] Revert "Test with no masking"

This reverts commit 1425539e6d4bfdba07f6fa242641b8fabc71caf0.
---
 .../algorithms/simulated_annealing_re.hpp     | 20 +++----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 75f5f9150..a97023aa0 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -285,7 +285,6 @@ namespace grb {
 			const size_t n_replicas = states.size();
 			const size_t n = grb::size(states[0]);
 			(void) n;
-			(void) n_procs;
 			(void) s;
 
 			grb::RC rc = grb::SUCCESS;
@@ -437,7 +436,7 @@ namespace grb {
 				}
 
 				// add new mask
-				masks.emplace_back( n );
+				masks.emplace_back( grb::Vector< bool, backend >( n ) );
 				auto &new_mask = masks.at(i);
 				rc = rc ? rc : grb::resize( new_mask, n );
 				rc = rc ? rc : grb::set< descr >( new_mask, frontier, static_cast< MaskType >(true) );
@@ -595,22 +594,9 @@ namespace grb {
 			rc = rc ? rc : grb::resize( dn, n );
 			rc = rc ? rc : grb::resize( accept, n );
 
-			std::vector< grb::Vector< bool, backend > > masks;
+			std::vector< grb::Vector< bool, backend > > masks ;
 			rc = rc ? rc : matrix_partition< descr >( masks, couplings, h, rand, seed );
-
-			std::vector< grb::Vector< bool, backend > > trivial_masks;
-
-			for(const auto &mask : masks ){
-				for( const auto &x : mask ){
-					trivial_masks.emplace_back( n );
-					grb::setElement( trivial_masks.back(), x.second, x.first );
-				}
-			}
-			assert( trivial_masks.size() == n );
-
-			masks = trivial_masks;
-
-			rc = rc ? rc : grb::clear( h );
+			rc = rc ? rc : grb::clear(h);
 			constexpr auto dense_descr = descr | grb::descriptors::dense;
 
 			auto sweep_data = std::tie(

From 35153401a8b5efdba6f2ac7998868da15a0eaa63 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 9 Jan 2026 10:56:16 +0100
Subject: [PATCH 53/58] Substituted typeof with decltype

---
 .../graphblas/algorithms/simulated_annealing_re.hpp    | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index a97023aa0..2a4f05052 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -600,16 +600,16 @@ namespace grb {
 			constexpr auto dense_descr = descr | grb::descriptors::dense;
 
 			auto sweep_data = std::tie(
-					(const typeof(couplings)&) couplings,
-					(const typeof(local_fields)&) local_fields,
-					(const typeof(masks)&) masks,
+					(const decltype(couplings)&) couplings,
+					(const decltype(local_fields)&) local_fields,
+					(const decltype(masks)&) masks,
 					h,
 					rand,
 					delta,
 					dn,
 					accept,
 					rng,
-					(const typeof(ring)&) ring
+					(const decltype(ring)&) ring
 					);
 
 #ifdef NDEBUG
@@ -619,7 +619,7 @@ namespace grb {
 #endif
 				 grb::Vector< StateType, backend > &state,
 				 const TempType &beta,
-				 typeof(sweep_data) &data
+				 decltype(sweep_data) &data
 			  ){
 				const size_t s 		= spmd<>::pid();
 				(void) s;

From cbe019537e525c1f4a9c28bc12d8f9641609f4f8 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 9 Jan 2026 11:12:18 +0100
Subject: [PATCH 54/58] Added simulated annealing test with planted solution

---
 tests/smoke/CMakeLists.txt                    |   5 +
 .../simulated_annealing_re_planted_sol.cpp    | 249 ++++++++++++++++++
 tests/smoke/smoketests.sh                     |   7 +-
 3 files changed, 260 insertions(+), 1 deletion(-)
 create mode 100644 tests/smoke/simulated_annealing_re_planted_sol.cpp

diff --git a/tests/smoke/CMakeLists.txt b/tests/smoke/CMakeLists.txt
index 03c55d7fa..cfb1d8506 100644
--- a/tests/smoke/CMakeLists.txt
+++ b/tests/smoke/CMakeLists.txt
@@ -154,6 +154,11 @@ add_grb_executables( simulated_annealing_re_ising simulated_annealing_re_ising.c
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
 )
 
+add_grb_executables( simulated_annealing_re_planted_sol simulated_annealing_re_planted_sol.cpp
+	BACKENDS reference reference_omp hyperdags nonblocking bsp1d
+	ADDITIONAL_LINK_LIBRARIES test_utils_headers
+)
+
 add_grb_executables( gmres gmres.cpp
 	BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking
 	ADDITIONAL_LINK_LIBRARIES test_utils_headers
diff --git a/tests/smoke/simulated_annealing_re_planted_sol.cpp b/tests/smoke/simulated_annealing_re_planted_sol.cpp
new file mode 100644
index 000000000..f6210806f
--- /dev/null
+++ b/tests/smoke/simulated_annealing_re_planted_sol.cpp
@@ -0,0 +1,249 @@
+#include <iostream>
+#include <vector>
+#include <random>
+#include <algorithm>
+#include <cmath>
+#include <cassert>
+
+#include <graphblas/algorithms/simulated_annealing_re.hpp>
+#include <graphblas.hpp>
+
+using QType = float;
+using StateType = int8_t;
+using EnergyType = double;
+
+template< grb::Backend backend >
+void generate_sparse_planted_qubo(
+    int n,
+    int degree,
+    std::pair< QType, QType > weight_range,
+    grb::Vector< QType, backend >  &Q_diag,
+    grb::Matrix< QType, backend > &Q_off,
+    grb::Vector< StateType, backend > &x_star,
+    double &E_star,
+    unsigned int seed = 0
+) {
+	std::minstd_rand rng( seed );
+	std::uniform_int_distribution< StateType > int_dist(0, 1);
+	std::uniform_real_distribution< QType > weight_dist(weight_range.first, weight_range.second);
+
+	std::vector< StateType > x ( n );
+	for( auto  &y : x ){
+		y = int_dist( rng );
+	}
+	grb::resize( x_star, n );
+	grb::buildVector( x_star, x.begin(), x.end(), grb::SEQUENTIAL );
+
+	grb::clear( Q_diag );
+	grb::clear( Q_off );
+	E_star = 0.0;
+
+	std::map< std::pair<size_t,size_t>, QType > Q;
+	std::vector< QType > Qdiag ( n, 0 );
+
+    for (size_t i = 0; i < n; ++i) {
+		std::vector< size_t > neighbors;
+        for (size_t j = 0; j < n; ++j) {
+            if (j != i) neighbors.push_back(j);
+        }
+		std::shuffle( neighbors.begin(), neighbors.end(), rng );
+        neighbors.resize( degree );
+
+        for (const auto&j : neighbors) {
+            if (j < i) continue;
+
+            const double w = weight_dist( rng );
+            const int b = x_star[i] ^ x_star[j];
+
+            if (b == 0) {
+                Qdiag[i] += w;
+                Qdiag[j] += w;
+                Q[{i, j}] -= 2*w;
+                Q[{j, i}] -= 2*w;
+            } else {
+                Qdiag[i] -= w;
+                Qdiag[j] -= w;
+                Q[{i, j}] += 2*w;
+                Q[{j, i}] += 2*w;
+                E_star += w;
+            }
+        }
+    }
+	std::vector< size_t > i, j;
+	std::vector< QType > v;
+	for(const auto &x : Q ){
+		i.push_back( x.first.first );
+		j.push_back( x.first.second );
+		v.push_back( x.second );
+	}
+
+	grb::buildVector( Q_diag, Qdiag.begin(), Qdiag.end(), grb::SEQUENTIAL );
+	grb::buildMatrixUnique( Q_off,
+			i.begin(), i.end(),
+			j.begin(), j.end(),
+			v.begin(), v.end(),
+			grb::SEQUENTIAL );
+}
+
+template<
+	grb::Backend backend,
+	grb::Descriptor descr = grb::descriptors::no_operation,
+	class Ring = grb::Semiring<
+		grb::operators::add< QType >, grb::operators::mul< QType >,
+		grb::identities::zero, grb::identities::one
+	>,
+	typename Ttmp
+	>
+EnergyType get_energy(
+				 const grb::Matrix< QType, backend >& couplings,
+				 const grb::Vector< QType, backend > &local_fields,
+				 const grb::Vector< StateType,backend > &state,
+				 grb::Vector< Ttmp, backend > &tmp,
+				 const Ring &ring = Ring()
+			  ){
+	const size_t n = grb::size( local_fields );
+	assert( n == grb::size( state ) );
+	assert( n == grb::ncols( couplings ) );
+	assert( n == grb::nrows( couplings ) );
+	grb::RC rc = grb::SUCCESS;
+	rc = rc ? rc : grb::resize( tmp, n );
+	EnergyType energy = 0.0;
+	constexpr auto dense_descr = descr | grb::descriptors::dense;
+
+	rc = rc ? rc : grb::set< descr >( tmp, 0.0 );
+	rc = rc ? rc : grb::mxv< dense_descr >( tmp, couplings, state, ring );
+	rc = rc ? rc : grb::foldl< dense_descr >( tmp, static_cast< QType >( 0.5 ), ring.getMultiplicativeMonoid() );
+	rc = rc ? rc : grb::foldl< dense_descr >( tmp, local_fields, ring.getAdditiveMonoid() );
+	rc = rc ? rc : grb::dot< dense_descr >( energy, tmp, state, ring );
+	assert( rc == grb::SUCCESS );
+
+	return energy;
+}
+
+template< grb::Backend backend >
+bool brute_force_check(
+    const grb::Vector< QType, backend > &Q_diag,
+    const grb::Matrix< QType, backend > &Q_off,
+    const grb::Vector< StateType, backend > &x_star,
+    double E_star
+) {
+    const int n = grb::size( x_star );
+    EnergyType min_energy = 1e7;
+	std::vector< grb::Vector< StateType > > argmins;
+	assert( n < 8 * sizeof( int64_t ) );
+
+	grb::Vector< double, backend > tmp ( n );
+	grb::Vector< StateType, backend > x ( n );
+    for (int64_t bits = 0; bits < (1 << n); ++bits) {
+        for (int i = 0; i < n; ++i) {
+			grb::setElement( x, (bits >> i) & 1, i);
+        }
+        double E = get_energy( Q_off, Q_diag, x, tmp );
+
+        if (E < min_energy - 1e-9) {
+            min_energy = E;
+            argmins = {x};
+        } else if (abs(E - min_energy) < 1e-9) {
+            argmins.push_back(x);
+        }
+    }
+
+	std::cout << "Planted energy   : " << -E_star << std::endl;
+	std::cout << "Minimum found    : " << min_energy << std::endl;
+	std::cout << "# ground states  : " << argmins.size() << std::endl;
+
+    bool planted_ok = false;
+    for (const auto &x : argmins) {
+        if ( std::equal(x.begin(), x.end(), x_star.begin()) ) {
+            planted_ok = true;
+            break;
+        }
+    }
+
+	std::cout << std::boolalpha;
+	std::cout << "planted_is_optimal: " << planted_ok << std::endl;
+    std::cout << "energy_matches: " << (std::abs(min_energy + E_star) < 1e-9) << std::endl;
+    std::cout << "degeneracy " << argmins.size() << std::endl;
+	return planted_ok;
+}
+
+void grbProgram( const size_t&n, grb::RC &rc ) {
+	rc = grb::SUCCESS;
+    const int degree = 4;
+    const std::pair< QType, QType > weight_range = {1.0, 1.0};
+    const unsigned int seed = 1;
+
+    grb::Vector< QType > Q_diag ( n );
+    grb::Matrix< QType > Q_off ( n, n );
+	grb::Vector< StateType > x_star ( n );
+    double E_star = 0.0;
+
+    generate_sparse_planted_qubo( n, degree, weight_range, Q_diag, Q_off, x_star, E_star, seed );
+
+    const bool optimal = brute_force_check(Q_diag, Q_off, x_star, E_star);
+	// assert( optimal );
+
+	std::cout << "------------------ Test with SA-RE ----------------------" << std::endl;
+	grb::Vector< StateType > best_state ( n );
+	EnergyType best_energy = 42;
+	constexpr bool use_pt = true;
+	constexpr EnergyType reference_energy = 0;
+	constexpr size_t nsweeps = 100;
+	constexpr size_t n_replicas = 16;
+	const size_t s = grb::spmd<>::pid();
+
+    std::minstd_rand rng ( seed + s ); // rng or std::mt19937
+
+    // create states storage and initialize with random 1/0 values
+    std::vector< grb::Vector< StateType > > states;
+    for ( size_t r = 0; r < n_replicas; ++r ) {
+        std::uniform_int_distribution< StateType > randint(0,1);
+        std::vector< StateType > rand_data;
+        for ( size_t i = 0; i < n; ++i ) {
+            rand_data.emplace_back( static_cast< StateType >(
+                randint( rng ) ) );
+        }
+        states.emplace_back( n );
+        rc = rc ? rc : grb::buildVector(
+            states.back(),
+            rand_data.cbegin(),
+            rand_data.cend(),
+            grb::SEQUENTIAL
+        );
+    }
+
+    // also make betas vector of size n_replicas and initialize with 10.0
+    grb::Vector< QType > betas( n_replicas );
+    grb::Vector< EnergyType > energies( n_replicas );
+    grb::Vector< EnergyType > tmp_energy( n );
+    for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
+        rc = rc ? rc : grb::setElement( betas, static_cast< QType >( (10.0) * std::pow<QType>( 2, r ) ), r );
+        rc = rc ? rc : grb::setElement( energies, get_energy( Q_off, Q_diag, states[r], tmp_energy ), r );
+    }
+	assert( rc == grb::SUCCESS );
+
+	rc = grb::algorithms::simulated_annealing_RE_Ising(
+		 Q_off, Q_diag, states, energies, betas, best_state, best_energy, nsweeps, reference_energy, use_pt, seed
+	);
+	assert( get_energy( Q_off, Q_diag, best_state, tmp_energy ) == best_energy );
+	std::cout << "Optimized SA-RE value: " << best_energy << std::endl;
+
+	if( best_energy != -E_star ){
+		rc = grb::FAILED;
+	}
+}
+
+int main() {
+	const size_t in = 18;
+	grb::RC out;
+
+	grb::Launcher< grb::AUTOMATIC > launcher;
+	grb::RC rc = launcher.exec( &grbProgram, in, out, true );
+	if ( rc != grb::SUCCESS ) {
+		std::cerr << "grbProgram launcher failed: " << toString(rc) << "\n";
+		return 4;
+	}
+	std::cout << "Test " << (( out == grb::SUCCESS )? "OK" : "FAILED") << std::endl;
+    return 0;
+
+}
diff --git a/tests/smoke/smoketests.sh b/tests/smoke/smoketests.sh
index 3150b5bd9..3e08561da 100755
--- a/tests/smoke/smoketests.sh
+++ b/tests/smoke/smoketests.sh
@@ -438,11 +438,16 @@ for BACKEND in ${BACKENDS[@]}; do
 			fi
 
 			if [ "$BACKEND" = "reference_omp" ] || [ "$BACKEND" = "reference"  ] || [ "$BACKEND" = "hyperdags" ] || [ "$BACKEND" = "nonblocking"  ]; then
-				echo ">>>      [x]           [ ]       Tests Simulated Annealing-Replica Exchange on a"
+				echo ">>>      [x]           [ ]       Testing Simulated Annealing-Replica Exchange on a"
 				echo "                                 small 16x16 matrix."
 				echo "Functional test executable: ${TEST_BIN_DIR}/simulated_annealing_re_ising_${BACKEND}"
 				$runner ${TEST_BIN_DIR}/simulated_annealing_re_ising_${BACKEND} --use-default-data --verify &> ${TEST_OUT_DIR}/simulated_annealing_re_ising_${BACKEND}_${P}_${T}.log
 				( grep "Test OK" ${TEST_OUT_DIR}/simulated_annealing_re_ising_${BACKEND}_${P}_${T}.log ) || printf 'Test FAILED.\n'
+				echo ">>>      [x]           [ ]       Testing Simulated Annealing-Replica Exchange on a"
+				echo "                                 18x18 matrix with constructed optimal solution."
+				echo "Functional test executable: ${TEST_BIN_DIR}/simulated_annealing_re_planted_sol_${BACKEND}"
+				$runner ${TEST_BIN_DIR}/simulated_annealing_re_planted_sol_${BACKEND} &> ${TEST_OUT_DIR}/simulated_annealing_re_planted_sol_${BACKEND}_${P}_${T}.log
+				( grep "Test OK" ${TEST_OUT_DIR}/simulated_annealing_re_planted_sol_${BACKEND}_${P}_${T}.log ) || printf 'Test FAILED.\n'
 			fi
 		done
 	done

From 23eaf9075b54e399f47dc53670e3e5b4256cd718 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 9 Jan 2026 11:34:53 +0100
Subject: [PATCH 55/58] fixup! Added more debug tests + seeding in
 simulated_annealing_ising affects simulated_annealing_RE

---
 include/graphblas/algorithms/simulated_annealing_re.hpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 2a4f05052..21f9d90fb 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -699,13 +699,6 @@ namespace grb {
 
 					// new_state = np.where(accept, 1 - old, old)
 					rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( 1 ), neq_operator );
-#ifndef NDEBUG
-					for( const auto x : accept ){
-						const size_t i = x.first;
-						if( x.second ) assert( state0[i] == 1-state[i] );
-						else assert( state0[i] == state[i] );
-					}
-#endif
 					
 					// delta = new - old ==> delta[accept] = 2*new_state[accept]-1
 					rc = rc ? rc : grb::set< descr >( delta, accept, state );

From 85dc6ac9ff1835245e5eac052f49349de296c032 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Tue, 13 Jan 2026 12:16:10 +0100
Subject: [PATCH 56/58] fixup! Added simulated annealing test with planted
 solution

---
 .../algorithms/simulated_annealing_re.hpp     |  1 +
 .../simulated_annealing_re_planted_sol.cpp    | 79 ++++++++++++++-----
 2 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index 21f9d90fb..cc9b235aa 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -285,6 +285,7 @@ namespace grb {
 			const size_t n_replicas = states.size();
 			const size_t n = grb::size(states[0]);
 			(void) n;
+			(void) n_procs;
 			(void) s;
 
 			grb::RC rc = grb::SUCCESS;
diff --git a/tests/smoke/simulated_annealing_re_planted_sol.cpp b/tests/smoke/simulated_annealing_re_planted_sol.cpp
index f6210806f..109ea805a 100644
--- a/tests/smoke/simulated_annealing_re_planted_sol.cpp
+++ b/tests/smoke/simulated_annealing_re_planted_sol.cpp
@@ -2,7 +2,7 @@
 #include <vector>
 #include <random>
 #include <algorithm>
-#include <cmath>
+#include <cstdlib>
 #include <cassert>
 
 #include <graphblas/algorithms/simulated_annealing_re.hpp>
@@ -12,10 +12,25 @@ using QType = float;
 using StateType = int8_t;
 using EnergyType = double;
 
+constexpr EnergyType EPS = 1e-6;
+
+template< typename T >	// true when a and b differ by less than EPS, absolutely or relative to a
+inline bool ISCLOSE( const T a, const T b ){
+	return (std::abs(a-b) < EPS) || (std::abs((a-b)/a) < EPS);	// scalar overloads; abs<T> only names the std::complex template
+}
+
+struct data_in {
+	size_t n = 18;
+	size_t degree = 5;
+	size_t n_replicas = 8;
+	size_t nsweeps = 5;
+	int seed = 0;
+};
+
 template< grb::Backend backend >
 void generate_sparse_planted_qubo(
-    int n,
-    int degree,
+    const size_t n,
+    const size_t degree,
     std::pair< QType, QType > weight_range,
     grb::Vector< QType, backend >  &Q_diag,
     grb::Matrix< QType, backend > &Q_off,
@@ -127,7 +142,7 @@ bool brute_force_check(
     const grb::Vector< StateType, backend > &x_star,
     double E_star
 ) {
-    const int n = grb::size( x_star );
+    const size_t n = grb::size( x_star );
     EnergyType min_energy = 1e7;
 	std::vector< grb::Vector< StateType > > argmins;
 	assert( n < 8 * sizeof( int64_t ) );
@@ -135,7 +150,7 @@ bool brute_force_check(
 	grb::Vector< double, backend > tmp ( n );
 	grb::Vector< StateType, backend > x ( n );
     for (int64_t bits = 0; bits < (1 << n); ++bits) {
-        for (int i = 0; i < n; ++i) {
+        for (size_t i = 0; i < n; ++i) {
 			grb::setElement( x, (bits >> i) & 1, i);
         }
         double E = get_energy( Q_off, Q_diag, x, tmp );
@@ -167,11 +182,12 @@ bool brute_force_check(
 	return planted_ok;
 }
 
-void grbProgram( const size_t&n, grb::RC &rc ) {
+void grbProgram( const struct data_in &in, grb::RC &rc ) {
 	rc = grb::SUCCESS;
-    const int degree = 4;
-    const std::pair< QType, QType > weight_range = {1.0, 1.0};
-    const unsigned int seed = 1;
+	const auto n = in.n;
+    const int degree = in.degree;
+    const std::pair< QType, QType > weight_range = {0.1, 1.0};
+    const unsigned int seed = in.seed;
 
     grb::Vector< QType > Q_diag ( n );
     grb::Matrix< QType > Q_off ( n, n );
@@ -179,17 +195,23 @@ void grbProgram( const size_t&n, grb::RC &rc ) {
     double E_star = 0.0;
 
     generate_sparse_planted_qubo( n, degree, weight_range, Q_diag, Q_off, x_star, E_star, seed );
+	std::cout << "Optimal value: " << -E_star << std::endl;
+
+	if( n < 22 ){
+		const bool optimal = brute_force_check(Q_diag, Q_off, x_star, E_star);
+		if( !optimal ){
+			rc = grb::FAILED;
+			std::cerr << "Constructed solution is not optimal." << std::endl;
+		}
+	}
 
-    const bool optimal = brute_force_check(Q_diag, Q_off, x_star, E_star);
-	// assert( optimal );
-
-	std::cout << "------------------ Test with SA-RE ----------------------" << std::endl;
+	// std::cout << "------------------ Test with SA-RE ----------------------" << std::endl;
 	grb::Vector< StateType > best_state ( n );
 	EnergyType best_energy = 42;
 	constexpr bool use_pt = true;
 	constexpr EnergyType reference_energy = 0;
-	constexpr size_t nsweeps = 100;
-	constexpr size_t n_replicas = 16;
+	const size_t nsweeps = in.nsweeps;
+	const size_t n_replicas = in.n_replicas;
 	const size_t s = grb::spmd<>::pid();
 
     std::minstd_rand rng ( seed + s ); // rng or std::mt19937
@@ -227,16 +249,35 @@ void grbProgram( const size_t&n, grb::RC &rc ) {
 	);
 	assert( get_energy( Q_off, Q_diag, best_state, tmp_energy ) == best_energy );
 	std::cout << "Optimized SA-RE value: " << best_energy << std::endl;
+	std::cout << "Absolute error: " << best_energy+E_star << std::endl;
+	std::cout << "Relative error: " << (best_energy+E_star)/best_energy << std::endl;
+
 
-	if( best_energy != -E_star ){
+	if( !ISCLOSE(best_energy, -E_star) ){
 		rc = grb::FAILED;
 	}
 }
 
-int main() {
-	const size_t in = 18;
-	grb::RC out;
+int main( int argc, char **argv ){
+	struct data_in in;
+	in.n = argc > 1 ? atoi(argv[1]) : 18 ;
+	in.degree = argc > 2 ? atoi(argv[2]) : 5 ;
+	in.n_replicas = argc > 3 ? atoi(argv[3]) : 8 ;
+	in.nsweeps = argc > 4 ? atoi(argv[4]) : 5 ;
+	in.seed = argc > 5 ? atoi(argv[5]) : 0 ;
+
+	if( in.n == 0 || in.degree == 0 || in.n_replicas == 0 ){
+		std::cout << "Usage: " << std::endl;
+		std::cout << argv[0] << " [n] [degree] [n_replicas] [nsweeps] [seed]" << std::endl;
+		exit( 0 );
+	}
+	std::cout << "n = " << in.n << std::endl;
+	std::cout << "degree = " << in.degree << std::endl;
+	std::cout << "n_replicas = " << in.n_replicas << std::endl;
+	std::cout << "nsweeps = " << in.nsweeps << std::endl;
+	std::cout << "seed = " << in.seed << std::endl;
 
+	grb::RC out;
 	grb::Launcher< grb::AUTOMATIC > launcher;
 	grb::RC rc = launcher.exec( &grbProgram, in, out, true );
 	if ( rc != grb::SUCCESS ) {

From 017dcfa76095436d78779d690634e68296c73620 Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 16 Jan 2026 12:42:05 +0100
Subject: [PATCH 57/58] New planted QUBO instance strategy - not yet using both

---
 .../simulated_annealing_re_planted_sol.cpp    | 270 ++++++++++++------
 1 file changed, 186 insertions(+), 84 deletions(-)

diff --git a/tests/smoke/simulated_annealing_re_planted_sol.cpp b/tests/smoke/simulated_annealing_re_planted_sol.cpp
index 109ea805a..3016519a4 100644
--- a/tests/smoke/simulated_annealing_re_planted_sol.cpp
+++ b/tests/smoke/simulated_annealing_re_planted_sol.cpp
@@ -2,13 +2,14 @@
 #include <vector>
 #include <random>
 #include <algorithm>
+#include <bitset>
 #include <cstdlib>
 #include <cassert>
 
 #include <graphblas/algorithms/simulated_annealing_re.hpp>
 #include <graphblas.hpp>
 
-using QType = float;
+using QType = double;
 using StateType = int8_t;
 using EnergyType = double;
 
@@ -20,13 +21,126 @@ inline bool ISCLOSE( const T a, const T b ){
 }
 
 struct data_in {
-	size_t n = 18;
+	// instance settings
+	size_t n = 5;	// size of small instance
+	size_t k = 3;	// number of small instances
 	size_t degree = 5;
+	// solver settings
 	size_t n_replicas = 8;
 	size_t nsweeps = 5;
+	// global setting
 	int seed = 0;
 };
 
+/**
+ * Evaluates the energy of a state for the given couplings and local fields.
+ * The scratch vector tmp is passed to grb::resize, zeroed, updated by mxv with
+ * the couplings, folded with 0.5 (multiplicative monoid) and with local_fields
+ * (additive monoid); its dot product with state is returned. Errors only assert.
+ */
+template<
+	grb::Backend backend,
+	grb::Descriptor descr = grb::descriptors::no_operation,
+	class Ring = grb::Semiring<
+		grb::operators::add< QType >, grb::operators::mul< QType >,
+		grb::identities::zero, grb::identities::one >,
+	typename Ttmp >
+EnergyType get_energy(
+	const grb::Matrix< QType, backend > &couplings,
+	const grb::Vector< QType, backend > &local_fields,
+	const grb::Vector< StateType, backend > &state,
+	grb::Vector< Ttmp, backend > &tmp,
+	const Ring &ring = Ring()
+) {
+	constexpr auto descr_dense = descr | grb::descriptors::dense;
+	const size_t dim = grb::size( local_fields );
+	assert( grb::size( state ) == dim );
+	assert( grb::nrows( couplings ) == dim && grb::ncols( couplings ) == dim );
+	EnergyType result = 0.0;
+	grb::RC status = grb::resize( tmp, dim );
+	if( status == grb::SUCCESS ) { status = grb::set< descr >( tmp, 0.0 ); }
+	if( status == grb::SUCCESS ) { status = grb::mxv< descr_dense >( tmp, couplings, state, ring ); }
+	if( status == grb::SUCCESS ) { status = grb::foldl< descr_dense >( tmp, static_cast< QType >( 0.5 ), ring.getMultiplicativeMonoid() ); }
+	if( status == grb::SUCCESS ) { status = grb::foldl< descr_dense >( tmp, local_fields, ring.getAdditiveMonoid() ); }
+	if( status == grb::SUCCESS ) { status = grb::dot< descr_dense >( result, tmp, state, ring ); }
+	assert( status == grb::SUCCESS );
+	return result;
+}
+
+
+/**
+ * Generates a random block-diagonal QUBO made of k independent dense blocks of
+ * n variables each, and brute-forces every block to find a minimiser.
+ * @param[in]  n      Number of variables per block (must stay below 63).
+ * @param[in]  k      Number of blocks; the full instance has n*k variables.
+ * @param[out] Q_diag Diagonal coefficients; cleared and rebuilt.
+ * @param[out] Q_off  Symmetric off-diagonal couplings; cleared and rebuilt.
+ * @param[out] x_star Minimising 0/1 state found by enumeration.
+ * @param[in]  seed   Seed for the generator drawing the weights from [-1,1).
+ */
+template< grb::Backend backend >
+void generate_random_qubo(
+    const size_t n, const size_t k,
+    grb::Vector< QType, backend >  &Q_diag,
+    grb::Matrix< QType, backend > &Q_off,
+    grb::Vector< StateType, backend >  &x_star,
+    unsigned int seed = 0
+) {
+	grb::RC rc = grb::SUCCESS;
+	rc = rc ? rc : grb::clear( Q_diag );
+	rc = rc ? rc : grb::clear( Q_off );
+	std::minstd_rand rng( seed );
+	std::uniform_real_distribution< QType > weight_dist( -1, 1 );
+
+	std::vector< size_t > rows, cols;	// coordinates of the off-diagonal nonzeroes, each generated once
+	std::vector< QType > vals;
+	std::vector< QType > Qdiag ( n*k, 0 );
+	for(size_t kk = 0; kk < k ; ++kk){
+		for (size_t i = 0; i < n; ++i) {
+			for (size_t j = i; j < n; ++j) {
+				const QType val = weight_dist(rng);
+				if (i == j){
+					Qdiag[n*kk+i] = val;
+				} else{
+					rows.push_back( n*kk+i ); cols.push_back( n*kk+j ); vals.push_back( val );
+					rows.push_back( n*kk+j ); cols.push_back( n*kk+i ); vals.push_back( val );
+				}
+			}
+		}
+	}
+	rc = rc ?  rc : grb::buildVector( Q_diag, Qdiag.begin(), Qdiag.end(), grb::SEQUENTIAL );
+	rc = rc ? rc : grb::buildMatrixUnique( Q_off,
+			rows.begin(), rows.end(),
+			cols.begin(), cols.end(),
+			vals.begin(), vals.end(),
+			grb::SEQUENTIAL );
+	assert( rc == grb::SUCCESS );
+
+	grb::Vector< StateType, backend > x ( n*k );
+	grb::Vector< QType, backend > tmp ( n*k );
+	rc = rc ? rc : grb::set( tmp, static_cast< QType >( 0 ) );
+	rc = rc ? rc : grb::set( x_star, static_cast<StateType>( 0 ) );
+
+	assert( n + 1 < 8 * sizeof( int64_t ) );	// the enumeration counter below must fit in int64_t
+	const int64_t n_configs = static_cast< int64_t >( 1 ) << n;	// 64-bit shift; a plain 1 << n is evaluated in int
+	QType min_energy = 0 ;	// energy of the all-zero start state
+	for(size_t kk = 0; kk < k ; ++kk){	// blocks are decoupled: optimise one at a time
+		rc = rc ? rc : grb::set( x, x_star );
+		for (int64_t bits = 0; bits < n_configs; ++bits) {
+			for (size_t b = 0; b < n; ++b) {
+				rc = rc ? rc : grb::setElement( x, (bits >> b) & 1, b + kk*n );
+			}
+			const double E = get_energy( Q_off, Q_diag, x, tmp );
+			if ( E < min_energy - 1e-9 ) {
+				min_energy = E;
+				x_star = x;
+			}
+		}
+	}
+	assert( get_energy( Q_off, Q_diag, x_star, tmp ) == min_energy );
+	assert( rc == grb::SUCCESS );
+}
+
 template< grb::Backend backend >
 void generate_sparse_planted_qubo(
     const size_t n,
@@ -34,23 +148,16 @@ void generate_sparse_planted_qubo(
     std::pair< QType, QType > weight_range,
     grb::Vector< QType, backend >  &Q_diag,
     grb::Matrix< QType, backend > &Q_off,
-    grb::Vector< StateType, backend > &x_star,
+    const grb::Vector< StateType, backend > &x_star,
     double &E_star,
     unsigned int seed = 0
 ) {
 	std::minstd_rand rng( seed );
-	std::uniform_int_distribution< StateType > int_dist(0, 1);
 	std::uniform_real_distribution< QType > weight_dist(weight_range.first, weight_range.second);
 
-	std::vector< StateType > x ( n );
-	for( auto  &y : x ){
-		y = int_dist( rng );
-	}
-	grb::resize( x_star, n );
-	grb::buildVector( x_star, x.begin(), x.end(), grb::SEQUENTIAL );
-
-	grb::clear( Q_diag );
-	grb::clear( Q_off );
+	grb::RC rc = grb::SUCCESS;
+	rc = rc ? rc : grb::clear( Q_diag );
+	rc = rc ? rc : grb::clear( Q_off );
 	E_star = 0.0;
 
 	std::map< std::pair<size_t,size_t>, QType > Q;
@@ -92,47 +199,13 @@ void generate_sparse_planted_qubo(
 		v.push_back( x.second );
 	}
 
-	grb::buildVector( Q_diag, Qdiag.begin(), Qdiag.end(), grb::SEQUENTIAL );
-	grb::buildMatrixUnique( Q_off,
+	rc = rc ? rc : grb::buildVector( Q_diag, Qdiag.begin(), Qdiag.end(), grb::SEQUENTIAL );
+	rc = rc ? rc : grb::buildMatrixUnique( Q_off,
 			i.begin(), i.end(),
 			j.begin(), j.end(),
 			v.begin(), v.end(),
 			grb::SEQUENTIAL );
-}
-
-template<
-	grb::Backend backend,
-	grb::Descriptor descr = grb::descriptors::no_operation,
-	class Ring = grb::Semiring<
-		grb::operators::add< QType >, grb::operators::mul< QType >,
-		grb::identities::zero, grb::identities::one
-	>,
-	typename Ttmp
-	>
-EnergyType get_energy(
-				 const grb::Matrix< QType, backend >& couplings,
-				 const grb::Vector< QType, backend > &local_fields,
-				 const grb::Vector< StateType,backend > &state,
-				 grb::Vector< Ttmp, backend > &tmp,
-				 const Ring &ring = Ring()
-			  ){
-	const size_t n = grb::size( local_fields );
-	assert( n == grb::size( state ) );
-	assert( n == grb::ncols( couplings ) );
-	assert( n == grb::nrows( couplings ) );
-	grb::RC rc = grb::SUCCESS;
-	rc = rc ? rc : grb::resize( tmp, n );
-	EnergyType energy = 0.0;
-	constexpr auto dense_descr = descr | grb::descriptors::dense;
-
-	rc = rc ? rc : grb::set< descr >( tmp, 0.0 );
-	rc = rc ? rc : grb::mxv< dense_descr >( tmp, couplings, state, ring );
-	rc = rc ? rc : grb::foldl< dense_descr >( tmp, static_cast< QType >( 0.5 ), ring.getMultiplicativeMonoid() );
-	rc = rc ? rc : grb::foldl< dense_descr >( tmp, local_fields, ring.getAdditiveMonoid() );
-	rc = rc ? rc : grb::dot< dense_descr >( energy, tmp, state, ring );
 	assert( rc == grb::SUCCESS );
-
-	return energy;
 }
 
 template< grb::Backend backend >
@@ -140,7 +213,7 @@ bool brute_force_check(
     const grb::Vector< QType, backend > &Q_diag,
     const grb::Matrix< QType, backend > &Q_off,
     const grb::Vector< StateType, backend > &x_star,
-    double E_star
+    const double opt_energy
 ) {
     const size_t n = grb::size( x_star );
     EnergyType min_energy = 1e7;
@@ -153,7 +226,11 @@ bool brute_force_check(
         for (size_t i = 0; i < n; ++i) {
 			grb::setElement( x, (bits >> i) & 1, i);
         }
-        double E = get_energy( Q_off, Q_diag, x, tmp );
+        const double E = get_energy( Q_off, Q_diag, x, tmp );
+		if(n < 6){
+			std::bitset<5> x (bits);
+			std::cerr << x << " --> " << E << std::endl;
+		}
 
         if (E < min_energy - 1e-9) {
             min_energy = E;
@@ -163,7 +240,7 @@ bool brute_force_check(
         }
     }
 
-	std::cout << "Planted energy   : " << -E_star << std::endl;
+	std::cout << "Planted energy   : " << opt_energy << std::endl;
 	std::cout << "Minimum found    : " << min_energy << std::endl;
 	std::cout << "# ground states  : " << argmins.size() << std::endl;
 
@@ -177,7 +254,7 @@ bool brute_force_check(
 
 	std::cout << std::boolalpha;
 	std::cout << "planted_is_optimal: " << planted_ok << std::endl;
-    std::cout << "energy_matches: " << (std::abs(min_energy + E_star) < 1e-9) << std::endl;
+    std::cout << "energy_matches: " << (std::abs(min_energy - opt_energy) < 1e-9) << std::endl;
     std::cout << "degeneracy " << argmins.size() << std::endl;
 	return planted_ok;
 }
@@ -185,28 +262,47 @@ bool brute_force_check(
 void grbProgram( const struct data_in &in, grb::RC &rc ) {
 	rc = grb::SUCCESS;
 	const auto n = in.n;
+	const auto k = in.k;
     const int degree = in.degree;
     const std::pair< QType, QType > weight_range = {0.1, 1.0};
     const unsigned int seed = in.seed;
 
-    grb::Vector< QType > Q_diag ( n );
-    grb::Matrix< QType > Q_off ( n, n );
-	grb::Vector< StateType > x_star ( n );
-    double E_star = 0.0;
+    grb::Vector< QType > Q_diag ( n*k ), Q_diag_rand ( n*k );
+    grb::Matrix< QType > Q_off ( n*k, n*k ), Q_off_rand ( n*k, n*k );
+	grb::Vector< StateType > x_star ( n*k );
+    double opt_energy = 0.0;
+
+	generate_random_qubo( n, k, Q_diag_rand, Q_off_rand, x_star, seed );
+    // generate_sparse_planted_qubo( n, degree, weight_range, Q_diag, Q_off, x_star, opt_energy, seed );
+
+	const grb::Monoid< grb::operators::add<QType>, grb::identities::zero > addMonoid;
+	// rc = rc ? rc :grb::foldl( Q_diag, Q_diag_rand, addMonoid );
+	// rc = rc ? rc :grb::foldl( Q_off, Q_off_rand, addMonoid );
+	
+	rc = rc ? rc : grb::set( Q_diag, Q_diag_rand );
+	// rc = rc ? rc : grb::set( Q_off, Q_off_rand );
+	std::swap( Q_off, Q_off_rand );
+	assert( rc == grb::SUCCESS );
+
+    grb::Vector< QType > tmp ( n*k );
+	rc = rc ? rc : grb::set( tmp, 0 );
+    
+    opt_energy = get_energy( Q_off, Q_diag, x_star, tmp );
 
-    generate_sparse_planted_qubo( n, degree, weight_range, Q_diag, Q_off, x_star, E_star, seed );
-	std::cout << "Optimal value: " << -E_star << std::endl;
+	std::cout << "Optimal value: " << opt_energy << std::endl;
 
-	if( n < 22 ){
-		const bool optimal = brute_force_check(Q_diag, Q_off, x_star, E_star);
+	if( n*k < 22 ){
+		const bool optimal = brute_force_check(Q_diag, Q_off, x_star, opt_energy);
 		if( !optimal ){
 			rc = grb::FAILED;
 			std::cerr << "Constructed solution is not optimal." << std::endl;
+			return;
 		}
 	}
+	assert( rc == grb::SUCCESS );
 
-	// std::cout << "------------------ Test with SA-RE ----------------------" << std::endl;
-	grb::Vector< StateType > best_state ( n );
+	std::cout << "------------------ Test with SA-RE ----------------------" << std::endl;
+	grb::Vector< StateType > best_state ( n*k );
 	EnergyType best_energy = 42;
 	constexpr bool use_pt = true;
 	constexpr EnergyType reference_energy = 0;
@@ -221,11 +317,11 @@ void grbProgram( const struct data_in &in, grb::RC &rc ) {
     for ( size_t r = 0; r < n_replicas; ++r ) {
         std::uniform_int_distribution< StateType > randint(0,1);
         std::vector< StateType > rand_data;
-        for ( size_t i = 0; i < n; ++i ) {
+        for ( size_t i = 0; i < n*k; ++i ) {
             rand_data.emplace_back( static_cast< StateType >(
                 randint( rng ) ) );
         }
-        states.emplace_back( n );
+        states.emplace_back( n*k );
         rc = rc ? rc : grb::buildVector(
             states.back(),
             rand_data.cbegin(),
@@ -233,49 +329,55 @@ void grbProgram( const struct data_in &in, grb::RC &rc ) {
             grb::SEQUENTIAL
         );
     }
+	assert( rc == grb::SUCCESS );
 
     // also make betas vector of size n_replicas and initialize with 10.0
     grb::Vector< QType > betas( n_replicas );
     grb::Vector< EnergyType > energies( n_replicas );
-    grb::Vector< EnergyType > tmp_energy( n );
     for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) {
         rc = rc ? rc : grb::setElement( betas, static_cast< QType >( (10.0) * std::pow<QType>( 2, r ) ), r );
-        rc = rc ? rc : grb::setElement( energies, get_energy( Q_off, Q_diag, states[r], tmp_energy ), r );
+        rc = rc ? rc : grb::setElement( energies, get_energy( Q_off, Q_diag, states[r], tmp ), r );
     }
 	assert( rc == grb::SUCCESS );
 
 	rc = grb::algorithms::simulated_annealing_RE_Ising(
 		 Q_off, Q_diag, states, energies, betas, best_state, best_energy, nsweeps, reference_energy, use_pt, seed
 	);
-	assert( get_energy( Q_off, Q_diag, best_state, tmp_energy ) == best_energy );
 	std::cout << "Optimized SA-RE value: " << best_energy << std::endl;
-	std::cout << "Absolute error: " << best_energy+E_star << std::endl;
-	std::cout << "Relative error: " << (best_energy+E_star)/best_energy << std::endl;
+	std::cout << "Absolute error: " << best_energy-opt_energy << std::endl;
+	std::cout << "Relative error: " << (best_energy-opt_energy)/best_energy << std::endl;
 
-
-	if( !ISCLOSE(best_energy, -E_star) ){
+	if( !ISCLOSE(best_energy, opt_energy) ){
 		rc = grb::FAILED;
 	}
 }
 
 int main( int argc, char **argv ){
 	struct data_in in;
-	in.n = argc > 1 ? atoi(argv[1]) : 18 ;
-	in.degree = argc > 2 ? atoi(argv[2]) : 5 ;
-	in.n_replicas = argc > 3 ? atoi(argv[3]) : 8 ;
-	in.nsweeps = argc > 4 ? atoi(argv[4]) : 5 ;
-	in.seed = argc > 5 ? atoi(argv[5]) : 0 ;
-
-	if( in.n == 0 || in.degree == 0 || in.n_replicas == 0 ){
+	in.n = argc > 1 ? atoi(argv[1]) : in.n ;
+	assert( in.n > 0 );
+	in.k = argc > 2 ? atoi(argv[2]) : in.k ;
+	assert( in.k > 0 );
+	in.degree = argc > 3 ? atoi(argv[3]) : in.degree ;
+	if( in.degree >= in.n ) in.degree = in.n-1;
+	in.n_replicas = argc > 4 ? atoi(argv[4]) : in.n_replicas ;
+	in.nsweeps = argc > 5 ? atoi(argv[5]) : in.nsweeps ;
+	in.seed = argc > 6 ? atoi(argv[6]) : in.seed ;
+
+	if( in.n == 0 || in.k == 0 || in.degree == 0 || in.n_replicas == 0 || argc > 7 ){	// k is only asserted above, which vanishes under NDEBUG
 		std::cout << "Usage: " << std::endl;
-		std::cout << argv[0] << " [n] [degree] [n_replicas] [nsweeps] [seed]" << std::endl;
+		std::cout << argv[0] << " [n] [k] [degree] [n_replicas] [nsweeps] [seed]" << std::endl;
 		exit( 0 );
 	}
-	std::cout << "n = " << in.n << std::endl;
-	std::cout << "degree = " << in.degree << std::endl;
-	std::cout << "n_replicas = " << in.n_replicas << std::endl;
-	std::cout << "nsweeps = " << in.nsweeps << std::endl;
-	std::cout << "seed = " << in.seed << std::endl;
+	std::cout << "\tn = " << in.n << std::endl;
+	std::cout << "\tk = " << in.k << std::endl;
+	std::cout << "\ttotal size = " << in.n*in.k << std::endl;
+	std::cout << "\tdegree = " << in.degree << std::endl;
+	std::cout << "\tn_replicas = " << in.n_replicas << std::endl;
+	std::cout << "\tnsweeps = " << in.nsweeps << std::endl;
+	std::cout << "\tseed = " << in.seed << std::endl;
+	assert( in.n < 22 );
+
 
 	grb::RC out;
 	grb::Launcher< grb::AUTOMATIC > launcher;

From 4a1fcf24c4eaf1836b3e19913e6f5b9f82ac51fc Mon Sep 17 00:00:00 2001
From: Giovanni Gaio <48856010+GiovaGa@users.noreply.github.com>
Date: Fri, 16 Jan 2026 12:42:28 +0100
Subject: [PATCH 58/58] Added some input checks in simulated_annealing

---
 .../graphblas/algorithms/simulated_annealing_re.hpp   | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp
index cc9b235aa..a8d82be89 100644
--- a/include/graphblas/algorithms/simulated_annealing_re.hpp
+++ b/include/graphblas/algorithms/simulated_annealing_re.hpp
@@ -542,12 +542,20 @@ namespace grb {
 			(void) s;
 			grb::RC rc = grb::SUCCESS;
 
+#ifndef NDEBUG
 			assert( grb::nnz(states[0]) == n ); // state is dense
 			assert( states.size() == n_replicas );
 			// assert( grb::is_symmetric( couplings ) );
+			for( const auto &state : states ){
+				for( size_t i = 0; i < n; ++i ){
+					assert( (state[i] == static_cast< StateType >( 0 )) ||
+							(state[i] == static_cast< StateType >( 1 )) );
+				}
+			}
 
 			assert( empty_local_fields || ( grb::size( local_fields ) == n ) );
 			assert( empty_local_fields || ( grb::nnz(local_fields) == n ) );
+#endif
 			EnergyType energy;
 			grb::Vector< EnergyType, backend > tmp_calc_energy ( n );
 
@@ -671,7 +679,8 @@ namespace grb {
 #ifndef NDEBUG
 					for( const auto x : dn ){
 						assert( mask[x.first] == 1 );
-						assert( (2*int(state[x.first])-1)*h[x.first] == x.second );
+						assert( ( (state[x.first] == 1) && ( x.second  == h[x.first]) ) ||
+								( (state[x.first] == 0) && ( x.second  == -h[x.first]) ) );
 					}
 					const auto dn0 = dn;
 #endif