diff --git a/.github/workflows/dbg_smoke.yml b/.github/workflows/dbg_smoke.yml
index 8fb2795ce..1f956c072 100644
--- a/.github/workflows/dbg_smoke.yml
+++ b/.github/workflows/dbg_smoke.yml
@@ -17,7 +17,7 @@ jobs:
run: sudo apt-get install -y libnuma-dev
- name: Configure
- run: mkdir build && cd build && ../bootstrap.sh --prefix=../install --debug-build
+ run: mkdir build && cd build && ../bootstrap.sh --prefix=../install --debug-build --no-dense
- name: Build
working-directory: ${{github.workspace}}/build
diff --git a/.gitignore b/.gitignore
index bbb0d673e..a0eca11b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ paths.mk
[Bb]uild*/
[Oo]bj*/
[Ii]nstall*/
-cmake-build-*/
\ No newline at end of file
+cmake-build-*/
+.vscode/
\ No newline at end of file
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 368bc4c26..d1c8c08e7 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -115,7 +115,7 @@ workflow:
# before_script:
# - yum -y update && yum -y groupinstall "Development Tools" && yum -y install make autoconf cmake numactl-devel
# script:
-# - rm -rf build install && mkdir -p install build && cd ./build && ../bootstrap.sh --prefix=../install && make -j$(nproc) && make -j$(nproc) build_tests_all
+# - rm -rf build install && mkdir -p install build && cd ./build && ../bootstrap.sh --prefix=../install --no-dense && make -j$(nproc) && make -j$(nproc) build_tests_all
# - *strip_symbols
# artifacts:
# paths:
@@ -131,7 +131,7 @@ workflow:
# before_script:
# - yum -y update && yum -y groupinstall "Development Tools" && yum -y install make autoconf cmake numactl-devel
# script:
-# - mkdir -p install build && cd ./build && ../bootstrap.sh --prefix=../install --debug-build && make -j$(nproc) && make -j$(nproc) build_tests_all
+# - mkdir -p install build && cd ./build && ../bootstrap.sh --prefix=../install --debug-build --no-dense && make -j$(nproc) && make -j$(nproc) build_tests_all
# rules:
# - if: $EXTRA_TESTS_ENABLED == "yes"
@@ -187,7 +187,7 @@ workflow:
build_test:
script:
- - mkdir -p install build && cd ./build && ../bootstrap.sh --prefix=../install --with-datasets=${ALP_DATASETS}
+ - mkdir -p install build && cd ./build && ../bootstrap.sh --prefix=../install --with-datasets=${ALP_DATASETS} --no-dense
&& make -j$(nproc) build_tests_all
- *strip_symbols
artifacts:
@@ -238,7 +238,7 @@ test_installation:
build_test_buildtype_debug:
script:
- - mkdir -p install build && cd ./build && ../bootstrap.sh --prefix=../install --with-datasets=${ALP_DATASETS}
+ - mkdir -p install build && cd ./build && ../bootstrap.sh --prefix=../install --with-datasets=${ALP_DATASETS} --no-dense
--debug-build && make -j$(nproc) && make -j$(nproc) build_tests_all
- *strip_symbols
artifacts:
@@ -272,7 +272,7 @@ gitleaks:
# factored out command to download the datasets, cmake, and build in non-debug mode
.setup_and_build_ndebug_slurm: &setup_and_build_ndebug_slurm
- mkdir -p install build && cd ./build
- - ../bootstrap.sh --prefix=../install --with-datasets=${SLURM_DATASETS_DIR_PATH} --no-hyperdags
+ - ../bootstrap.sh --prefix=../install --with-datasets=${SLURM_DATASETS_DIR_PATH} --no-hyperdags --no-dense
- make -j$(nproc)
tests_performance_slurm:
@@ -318,7 +318,7 @@ build_test_lpf:
- if: $LPF_TESTS_ENABLED == "yes"
script:
# build only LPF-related tests
- - mkdir -p install build && cd ./build && ../bootstrap.sh --with-lpf=${LPF_PATH} --no-nonblocking --no-reference
+ - mkdir -p install build && cd ./build && ../bootstrap.sh --with-lpf=${LPF_PATH} --no-nonblocking --no-reference --no-dense
--no-hyperdags --prefix=../install --with-datasets=${ALP_DATASETS} && make -j$(nproc) build_tests_all
- *strip_symbols
artifacts:
@@ -389,7 +389,7 @@ build_test_gcc_versions:
# VER: [11,12,13,14]
script:
- mkdir -p install build && cd ./build &&
- CXX=${CXX_COMPILER}-${VER} CC=${CC_COMPILER}-${VER} ../bootstrap.sh
+ CXX=${CXX_COMPILER}-${VER} CC=${CC_COMPILER}-${VER} ../bootstrap.sh --no-dense
--prefix=../install --with-datasets=${ALP_DATASETS}
--with-lpf=${LPF_BASE_PATH}/build_mpich_${CC_COMPILER}_${VER}/install &&
make -j$(nproc) build_tests_all
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 02c49eb37..c667372e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,7 +37,7 @@ project( GraphBLAS
DESCRIPTION "The ultimate engine for sparse computation"
LANGUAGES CXX C
)
-set( CMAKE_CXX_STANDARD 11 )
+set( CMAKE_CXX_STANDARD 14 )
set( CMAKE_CXX_STANDARD_REQUIRED ON )
# install within the build directory by default (NOT to /usr/local or the likes)
@@ -50,6 +50,8 @@ endif()
### CONFIGURATION OPTIONS
# to choose backends and dependencies
option( WITH_REFERENCE_BACKEND "With Reference backend" ON )
+option( WITH_ALP_REFERENCE_BACKEND "With Reference Dense backend" ON )
+option( WITH_ALP_DISPATCH_BACKEND "With Dispatch Dense backend" OFF )
option( WITH_OMP_BACKEND "With OMP backend" ON )
option( WITH_HYPERDAGS_BACKEND "With Hyperdags backend" ON )
if( WITH_HYPERDAGS_BACKEND )
@@ -58,6 +60,7 @@ if( WITH_HYPERDAGS_BACKEND )
endif()
endif()
option( WITH_NONBLOCKING_BACKEND "With Nonblocking backend" ON )
+option( WITH_ALP_OMP_BACKEND "With OMP Dense backend" OFF )
option( WITH_NUMA "With NUMA support" ON )
option( LPF_INSTALL_PATH "Path to the LPF tools for the BSP1D and Hybrid backends" OFF )
# the following options depend on LPF_INSTALL_PATH being set
@@ -132,7 +135,11 @@ if( NOT WITH_REFERENCE_BACKEND AND
NOT WITH_NONBLOCKING_BACKEND AND
NOT WITH_BSP1D_BACKEND AND
NOT WITH_HYBRID_BACKEND AND
- NOT WITH_HYPERDAGS_BACKEND )
+ NOT WITH_HYPERDAGS_BACKEND AND
+ NOT WITH_ALP_REFERENCE_BACKEND AND
+ NOT WITH_ALP_DISPATCH_BACKEND AND
+ NOT WITH_ALP_OMP_BACKEND )
+ # at least one backend should be enabled
message( FATAL_ERROR "At least one backend should be enabled")
endif()
@@ -156,6 +163,15 @@ endif()
if( WITH_HYBRID_BACKEND )
list( APPEND AVAILABLE_TEST_BACKENDS "hybrid" )
endif()
+if( WITH_ALP_REFERENCE_BACKEND )
+ list( APPEND AVAILABLE_TEST_BACKENDS "alp_reference" )
+endif()
+if( WITH_ALP_DISPATCH_BACKEND )
+ list( APPEND AVAILABLE_TEST_BACKENDS "alp_dispatch" )
+endif()
+if( WITH_ALP_OMP_BACKEND )
+ list( APPEND AVAILABLE_TEST_BACKENDS "alp_omp" )
+endif()
# Enable backends based on features
if( ENABLE_SOLVER_LIB )
@@ -267,7 +283,6 @@ if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND )
find_package( LPF REQUIRED )
endif( )
-
### SETTINGS FOR COMPILATION
set( TEST_CATEGORIES "unit" "smoke" "performance" )
@@ -280,6 +295,10 @@ set( TEST_CATEGORIES "unit" "smoke" "performance" )
# scope and propagate down to the other files
include( AddGRBVars )
+if( WITH_ALP_DISPATCH_BACKEND )
+ include( Blas )
+endif( )
+
# here, add information for wrappers generated during installation
include( AddGRBInstall )
@@ -303,8 +322,11 @@ include( Transition )
# by default no headers are built
set( WITH_REFERENCE_BACKEND_HEADERS OFF )
+set( WITH_ALP_REFERENCE_BACKEND_HEADERS OFF )
+set( WITH_ALP_DISPATCH_BACKEND_HEADERS OFF )
set( WITH_OMP_BACKEND_HEADERS OFF )
set( WITH_HYPERDAGS_BACKEND_HEADERS OFF )
+set( WITH_ALP_OMP_BACKEND_HEADERS OFF )
# activate headers based on requested backends
if( WITH_REFERENCE_BACKEND OR WITH_BSP1D_BACKEND OR WITH_NONBLOCKING_BACKEND )
@@ -321,6 +343,18 @@ if( WITH_OMP_BACKEND OR WITH_HYBRID_BACKEND )
set( WITH_OMP_BACKEND_HEADERS ON )
endif()
+if( WITH_ALP_REFERENCE_BACKEND )
+ set( WITH_ALP_REFERENCE_BACKEND_HEADERS ON )
+endif()
+
+if( WITH_ALP_DISPATCH_BACKEND )
+ set( WITH_ALP_DISPATCH_BACKEND_HEADERS ON )
+endif()
+
+if( WITH_ALP_OMP_BACKEND )
+ set( WITH_ALP_OMP_BACKEND_HEADERS ON )
+endif()
+
add_subdirectory( include )
### BACKEND IMPLEMENTATIONS
diff --git a/README.md b/README.md
index 54c891dd4..b39d14b7c 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,9 @@ See the License for the specific language governing permissions and
limitations under the License.
+# ALP/Dense Testing Guide
+
+Please visit [The ALP/Dense Tests](alpdense.md) for the latest information about testing the prototype ALP/Dense interface and backends.
This distribution contains the C++ Algebraic Programming (ALP) framework, and
provides the ALP/GraphBLAS, ALP/Pregel, and Sparse BLAS programming interfaces.
diff --git a/alpdense.md b/alpdense.md
new file mode 100644
index 000000000..7f109436c
--- /dev/null
+++ b/alpdense.md
@@ -0,0 +1,314 @@
+# Introduction
+This file is intended to provide instructions for:
+- Running smoke, unoptimized performance tests for the ALP/Dense sequential reference backend (aka alp_reference);
+- Running optimized performance tests of the ALP/Dense sequential reference backend with dispatch to BLAS (aka alp_dispatch);
+- Running optimized performance tests of the ALP/Dense shared memory backend with dispatch to BLAS (aka alp_omp).
+
+# Performance Tests
+
+These tests have been executed:
+- On a Kunpeng 920 node using 1 core for the sequential reference and alp_dispatch tests and 64 cores for the alp_omp tests;
+- Compiling with gcc 9.4.0;
+- Linking against KunpengBLAS from the Kunpeng BoostKit 22.0.RC1 and the netlib LAPACK linking to the same BLAS library.
+- All tests report runtime in milliseconds after the _time (ms, ...)_ text lines printed on screen.
+
+In our evaluation we extracted the _Kunpeng BoostKit 22.0.RC1_ in a `BLAS_ROOT` folder (the `usr/local/kml` directory extracted from the `boostkit-kml-1.6.0-1.aarch64.rpm` package). `BLAS_ROOT` should contain the `include/kblas.h` header file and the `lib/kblas/{locking, nolocking, omp, pthread}/libkblas.so` library.
+
+If no system LAPACK library can be found by the compiler, `LAPACK_LIB` (containing the `liblapack.{a,so}` library) and `LAPACK_INCLUDE` (containing the `lapacke.h` header file) have to be appropriately set and provided to cmake, for example exporting them as follows:
+
+```
+# The root folder where this branch is cloned.
+export ALP_SOURCE="$(realpath ../)"
+# The build folder from which running these steps.
+export ALP_BUILD="$(pwd)"
+# The KML installation folder.
+# For example, the "usr/local/kml" directory extracted from the "boostkit-kml-1.6.0-1.aarch64.rpm"
+#export BLAS_ROOT="/path/to/kunpengblas/boostkit-kml-1.6.0.aarch64/usr/local/kml"
+# The lib folder of the LAPACK library.
+#export LAPACK_LIB="/path/to/lapack/netlib/build/lib"
+# The include folder of the LAPACK library.
+# Must include the C/C++ LAPACKE interface.
+#export LAPACK_INCLUDE="/path/to/lapack/netlib/lapack-3.9.1/LAPACKE/include/"
+
+if [ -z ${BLAS_ROOT+x} ] || [ -z ${LAPACK_LIB+x} ] || [ -z ${LAPACK_INCLUDE+x} ]; then
+ echo "Please define BLAS_ROOT, LAPACK_LIB, and LAPACK_INCLUDE variables."
+fi
+```
+
+In particular, we assume the availability of the C/C++ LAPACKE interface and, for all tests below, we assume no system libraries are available.
+
+***Assuming this branch is cloned in the `ALP_SOURCE` folder, all instructions provided below should be run from a `$ALP_SOURCE/build` folder.***
+
+An analogous [script-like](alpdense.sh) version of this page is available in the ALP root directory of this branch. You may decide to run it directly (**note:** always making sure to customize the export commands above to your environment first) as follows:
+
+```
+bash ../alpdense.sh
+```
+
+The script also logs the output of each test group below into a separate file in `$ALP_BUILD/logs`, i.e.,
+- Smoke tests:
+ - `alp_smoketests.log` (ALP smoketests - reference backend, unoptimized)
+ - `lapack_smoketests.log` (LAPACK smoketests - sequential KBLAS)
+- Performance tests:
+ - `lapack_doptrf_seq.log` (LAPACK `dpotrf` - sequential KBLAS)
+ - `alp_dpotrf_seq.log` (ALP `dpotrf` - dispatch backend, sequential KBLAS)
+ - `lapack_dpotrf_omp.log` (LAPACK `dpotrf` - shared-memory KBLAS)
+ - `alp_dpotrf_omp.log` (ALP `dpotrf` - dispatch backend, shared-memory KBLAS)
+ - `kblas_mxm_omp.log` (KunpengBLAS `mxm` - shared memory)
+ - `alp_smoketests.log` (ALP `mxm` - omp+dispatch backends, shared-memory KBLAS)
+
+The rest of this page describes each step of the script above.
+
+# Source Code Location
+
+Assuming this branch is cloned in the `ALP_SOURCE` folder, all ALP/Dense include files are located in the `$ALP_SOURCE/include/alp` folder:
+- In particular, all the pre-implemented algorithms are located in `$ALP_SOURCE/include/alp/algorithms`
+- The reference, dispatch, and omp backends are located in `$ALP_SOURCE/include/alp/reference`, `$ALP_SOURCE/include/alp/dispatch`, and `$ALP_SOURCE/include/alp/omp`, respectively.
+
+All tests discussed below are collected in the `$ALP_SOURCE/tests/smoke` and `$ALP_SOURCE/tests/performance` folders. The folder `$ALP_SOURCE/tests/unit` contains additional unit tests not discussed on this page.
+
+# Dependencies
+
+For all tests below, the standard ALP dependencies are required:
+- gfortran: -lgfortran
+- LibNUMA: -lnuma
+- Standard math library: -lm
+- POSIX threads: -lpthread
+- OpenMP: -fopenmp in the case of GCC
+
+# Sequential Smoke Tests (Functional, Unoptimized)
+
+We collect the following smoke tests associated with the ALP/Dense reference backend:
+- Basic targets:
+ - General matrix-matrix multiplication ([source](tests/smoke/alp_gemm.cpp))
+ - Householder tridiagonalization of a real symmetric/complex Hermitian matrix ([source](tests/smoke/alp_zhetrd.cpp))
+ - Divide and conquer tridiagonal eigensolver for tridiagonal, real symmetric matrices ([source](tests/smoke/alp_stedc.cpp))
+ - Eigensolver for real symmetric matrices ([source](tests/smoke/alp_zheevd.cpp))
+ - Householder QR decomposition of a real/complex general matrix ([source](tests/smoke/alp_zgeqrf.cpp))
+- Challenge targets:
+ - Triangular linear system solve using backsubstitution of upper tridiagonal, real/complex matrix ([source](tests/smoke/alp_backsubstitution.cpp))
+ - Triangular linear system solve using forwardsubstitution of lower tridiagonal, real/complex matrix ([source](tests/smoke/alp_forwardsubstitution.cpp))
+ - Cholesky decomposition of a symmetric/Hermitian positive definite matrix ([source](tests/smoke/alp_cholesky.cpp))
+ - Householder LU decomposition of a real/complex general matrices ([source](tests/smoke/alp_zgetrf.cpp))
+ - Inverse of a symmetric/Hermitian positive definite matrix ([source](tests/smoke/alp_potri.cpp))
+ - Singular value decomposition of a real/complex general matrix ([source](tests/smoke/alp_zgesvd.cpp))
+
+These tests are collected and run as ALP smoketests.
+From `$ALP_SOURCE/build` run:
+
+```
+cmake -DWITH_ALP_REFERENCE_BACKEND=ON -DCMAKE_INSTALL_PREFIX=./install $ALP_SOURCE || ( echo "test failed" && exit 1 )
+make install -j$(nproc) || ( echo "test failed" && exit 1 )
+SMOKE_PRINT_TIME=ON make smoketests_alp -j$(nproc)
+```
+
+**Note:** The variable `SMOKE_PRINT_TIME=ON` is used to print timing information of each test to screen. Set it to `OFF` or remove it from the command if this action is not desired.
+
+If the tests run correctly, for each of them you should see an output similar to the following:
+
+```
+****************************************************************************************
+ FUNCTIONAL PERFORMANCE DESCRIPTION
+----------------------------------------------------------------------------------------
+
+>>> [x] [ ] Tests Cholesky decomposition for a random
+ symmetric positive definite matrix (100x100).
+Timing of blocked inplace version with bs = 64.
+ time (ms, total) = 72.1747
+ time (ms, per repeat) = 3.60873
+Test OK
+
+```
+
+To compare with LAPACK+KunpengBLAS (not ALP code) you may run the following:
+
+```
+KBLAS_LIB=$BLAS_ROOT/lib/kblas/locking
+USECASES=("dstedc" "dsyevd" "dsytrd" "zhetrd" "dgeqrf" "dgesvd" "dgetrf" "dpotri")
+
+for USECASE in "${USECASES[@]}"
+do
+ install/bin/grbcxx -o ${USECASE}_lapack_reference.exe $ALP_SOURCE/tests/performance/lapack_${USECASE}.cpp $LAPACK_LIB/liblapack.a $KBLAS_LIB/libkblas.so -Wl,-rpath $KBLAS_LIB -I$LAPACK_INCLUDE -lgfortran || ( echo "Compiling ${USECASE} failed" && exit 1 )
+done
+
+for USECASE in "${USECASES[@]}"
+do
+ ./${USECASE}_lapack_reference.exe -n 100 -repeat 20 || ( echo "test ${USECASE} failed" && exit 1 )
+done
+```
+
+# Sequential Cholesky Decomposition Tests (Dispatch, Optimized)
+
+Here we compare our ALP Cholesky implementation, based on the alp_dispatch backend, against the `potrf` LAPACK functionality.
+
+From the `$ALP_SOURCE/build` folder run the following commands:
+
+```
+cmake -DKBLAS_ROOT="$BLAS_ROOT" -DWITH_ALP_DISPATCH_BACKEND=ON -DCMAKE_INSTALL_PREFIX=./install $ALP_SOURCE || ( echo "test failed" && exit 1 )
+make install -j$(nproc) || ( echo "test failed" && exit 1 )
+```
+
+## LAPACK-Based Test (Sequential BLAS)
+
+To compile and run the LAPACK-based Cholesky test (not ALP code) run the following commands:
+```
+install/bin/grbcxx -b alp_dispatch -o cholesky_lapack_reference.exe $ALP_SOURCE/tests/performance/lapack_cholesky.cpp $LAPACK_LIB/liblapack.a -I$LAPACK_INCLUDE -lgfortran || ( echo "test failed" && exit 1 )
+for MSIZE in {400..4000..100}
+do
+ ./cholesky_lapack_reference.exe -n ${MSIZE} -repeat 10 || ( echo "test failed" && exit 1 )
+done
+```
+
+If the commands run correctly the output on screen should look like the following:
+
+```
+Testing dpotrf_ for U^T * U = S, with S SPD of size ( 1024 x 1024 )
+Test repeated 10 times.
+ time (ms, total) = 433.652
+ time (ms, per repeat) = 43.3652
+Tests OK
+```
+
+In our tests, we executed `./cholesky_lapack_reference.exe` with matrix sizes (`-n` flag) in the range [400, 4000] in steps of 100.
+
+## ALP-Based Test (Dispatch Sequential Building Blocks to Optimized BLAS)
+
+Some facts about this test:
+- The algorithm is a blocked variant of Cholesky with block size BS = 64 (as done in LAPACK).
+- It recursively requires an unblocked version of the same algorithm (of size BSxBS) which does not dispatch to LAPACK.
+- All BLAS functions needed by the algorithm are dispatched to the external BLAS library. In particular, as POC of what ALP could offer in terms of performance if its primitives could be efficiently generated/optimized (e.g., via our envisioned MLIR-based backend for delayed compilation), it dispatches the triangular solve and the fused `foldl`+`mxm` operations.
+
+```
+make test_alp_cholesky_perf_alp_dispatch -j$(nproc) || ( echo "test failed" && exit 1 )
+for MSIZE in {400..4000..100}
+do
+ tests/performance/alp_cholesky_perf_alp_dispatch -n ${MSIZE} -repeat 10 || ( echo "test failed" && exit 1 )
+done
+```
+
+If the commands run correctly the output on screen should look like the following:
+
+```
+Testing Cholesky decomposition U^T * U = S, with S SPD of size ( 1024 x 1024 )
+Test repeated 10 times.
+ time (ms, total) = 463.652
+ time (ms, per repeat) = 46.3652
+Tests OK
+```
+
+As for the LAPACK-based test, we executed `tests/performance/alp_cholesky_perf_alp_dispatch` with matrix sizes (`-n` flag) in the range [400, 4000] in steps of 100.
+
+**Note:** A consistent test should use the same BLAS in LAPACK-based as well as in the ALP-based tests.
+
+## Cholesky Decomposition with Shmem BLAS
+
+An analogous experiment can be conducted using the shared-memory BLAS library in place of the sequential one as follows (the following block runs both the LAPACK and the ALP tests):
+
+```
+subbuild="build_potrf_with_omp_blas"
+rm -rf $subbuild && mkdir $subbuild && cd $subbuild
+cmake -DKBLAS_ROOT="$BLAS_ROOT" -DKBLAS_IMPL=omp -DWITH_ALP_OMP_BACKEND=ON -DWITH_ALP_DISPATCH_BACKEND=ON -DCMAKE_INSTALL_PREFIX=./install $ALP_SOURCE || ( echo "test failed" && exit 1 )
+make install -j$(nproc) || ( echo "test failed" && exit 1 )
+
+install/bin/grbcxx -b alp_dispatch -o cholesky_lapack_omp.exe $ALP_SOURCE/tests/performance/lapack_cholesky.cpp $LAPACK_LIB/liblapack.a -I$LAPACK_INCLUDE -lgfortran || ( echo "test failed" && exit 1 )
+for NT in 1 64 96
+do
+ echo "#####################################################################"
+ echo " Testing potrf: LAPACK + KunpengBLAS (omp) with OMP_NUM_THREADS=${NT}"
+ echo "#####################################################################"
+ for MSIZE in {400..4000..100}
+ do
+ OMP_NUM_THREADS=${NT} ./cholesky_lapack_omp.exe -n ${MSIZE} -repeat 10 || ( echo "test failed" && exit 1 )
+ done
+ echo " Tests completed."
+ echo "#####################################################################"
+done
+
+make test_alp_cholesky_perf_alp_dispatch -j$(nproc) || ( echo "test failed" && exit 1 )
+for NT in 1 64 96
+do
+ echo "##########################################################################"
+ echo "Testing potrf: Testing ALP + KunpengBLAS (omp) with OMP_NUM_THREADS=${NT}"
+ echo "##########################################################################"
+ for MSIZE in {400..4000..100}
+ do
+ OMP_NUM_THREADS=${NT} tests/performance/alp_cholesky_perf_alp_dispatch -n ${MSIZE} -repeat 10 || ( echo "test failed" && exit 1 )
+ done
+ echo " Tests completed."
+ echo "##########################################################################"
+done
+cd $ALP_BUILD
+
+```
+
+# Shared-Memory Parallel `mxm` Tests (Optimized)
+
+Here we compare our ALP shared memory backend (alp_omp) `mxm` implementation against the BLAS's `gemm` functionality.
+`mxm` is an in-place ALP primitive that computes C = C + A*B, with matrices of conforming sizes.
+
+Our shared-memory backend implementation currently supports only square thread grids (although the methodology is not limited to that in general). For this reason, in the tests below we run both LAPACK and ALP using 64 threads. To ensure a fair comparison, we link with the `omp` version of KunpengBLAS.
+
+You can compile with the `omp` version of KunpengBLAS by additionally providing the `-DKBLAS_IMPL=omp` flag when calling cmake. However, this should be compiled in a different directory from the other BLAS-based builds, as follows:
+```
+subbuild="build_mxm_with_omp_blas"
+rm -rf $subbuild && mkdir $subbuild && cd $subbuild
+cmake -DKBLAS_ROOT="$BLAS_ROOT" -DKBLAS_IMPL=omp -DWITH_ALP_OMP_BACKEND=ON -DWITH_ALP_DISPATCH_BACKEND=ON -DCMAKE_INSTALL_PREFIX=./install $ALP_SOURCE || ( echo "test failed" && exit 1 )
+make install -j$(nproc) || ( echo "test failed" && exit 1 )
+```
+
+## `gemm`-Based BLAS Test.
+
+From `$subbuild` run:
+```
+install/bin/grbcxx -b alp_dispatch -o blas_mxm.exe $ALP_SOURCE/tests/performance/blas_mxm.cpp -lgfortran || ( echo "test failed" && exit 1 )
+for MSIZE in {1024..10240..1024}
+do
+ OMP_NUM_THREADS=64 ./blas_mxm.exe -n ${MSIZE} -repeat 10 || ( echo "test failed" && exit 1 )
+done
+cd $ALP_BUILD
+```
+
+If the commands run correctly the output on screen should look like the following:
+
+```
+Testing cblas_dgemm for C(1024 x 1024) += A(1024 x 1024) x B(1024 x 1024) 10 times.
+ time (ms, total) = 116.494
+ time (ms, per repeat) = 11.6494
+Tests OK
+```
+
+In our tests, we executed `./blas_mxm.exe` with matrix sizes (`-n` flag) in the range [1024:1024:10240].
+
+## ALP-Based Test (Dispatch Sequential Building Blocks to Optimized BLAS).
+
+Some facts about this test:
+- The ALP `mxm` shared memory implementation is based on a [2.5D matrix multiplication algorithm](https://netlib.org/lapack/lawnspdf/lawn248.pdf);
+- In this test we execute with a 3D thread grid of size 4x4x4;
+- We set `OMP_NUM_THREADS=64` threads and fix `GOMP_CPU_AFFINITY="0-15 24-39 48-63 72-87"` to reflect the cores and NUMA topology of the node;
+- The algorithm is allocating memory using a 2D block-cyclic layout with blocks of size 128x128.
+- Each sequential block-level `mxm` (128x128x128) is dispatched to the selected BLAS library.
+
+From `$ALP_SOURCE/build` run:
+
+```
+subbuild="build_mxm_with_alp_omp"
+rm -rf $subbuild && mkdir $subbuild && cd $subbuild
+cmake -DKBLAS_ROOT="$BLAS_ROOT" -DWITH_ALP_DISPATCH_BACKEND=ON -DWITH_ALP_OMP_BACKEND=ON -DCMAKE_INSTALL_PREFIX=./install $ALP_SOURCE || ( echo "test failed" && exit 1 )
+make test_alp_mxm_perf_alp_omp -j$(nproc) || ( echo "test failed" && exit 1 )
+for MSIZE in {1024..10240..1024}
+do
+ GOMP_CPU_AFFINITY="0-15 24-39 48-63 72-87" OMP_NUM_THREADS=64 tests/performance/alp_mxm_perf_alp_omp -n ${MSIZE} -repeat 10 || ( echo "test failed" && exit 1 )
+done
+cd $ALP_BUILD
+```
+
+If the commands run correctly the output on screen should look like the following:
+
+```
+Testing C(1024 x 1024) += A(1024 x 1024) x B(1024 x 1024) 10 times.
+ time (ms, total) = 69.7239
+ time (ms, per repeat) = 6.97239
+Tests OK
+```
+
+As for the gemm-based test, we executed `tests/performance/alp_mxm_perf_alp_omp` with matrix sizes (`-n` flag) in the range [1024:1024:10240].
diff --git a/alpdense.sh b/alpdense.sh
new file mode 100644
index 000000000..b817853d9
--- /dev/null
+++ b/alpdense.sh
@@ -0,0 +1,258 @@
+# This file is intended to provide instructions for:
+# Running smoke tests for the ALP/Dense reference backend (aka alp_reference);
+# Running performance tests of the ALP/Dense reference backend with dispatch to BLAS (aka alp_dispatch);
+# Running performance tests of the ALP/Dense shared memory backend with dispatch to BLAS (aka alp_omp).
+
+# For all tests below standard ALP dependencies are required:
+# gfortran: -lgfortran
+# LibNUMA: -lnuma
+# Standard math library: -lm
+# POSIX threads: -lpthread
+# OpenMP: -fopenmp in the case of GCC
+
+# Before running please export:
+
+# The root folder where this branch is cloned.
+export ALP_SOURCE="$(realpath ../)"
+
+# The build folder from which running these steps.
+export ALP_BUILD="$(pwd)"
+
+# The KML installation folder.
+# For example, the "usr/local/kml" directory extracted from the "boostkit-kml-1.6.0-1.aarch64.rpm"
+#export BLAS_ROOT="/path/to/kunpengblas/boostkit-kml-1.6.0.aarch64/usr/local/kml"
+
+# The lib folder of the LAPACK library.
+#export LAPACK_LIB="/path/to/lapack/netlib/build/lib"
+
+# The include folder of the LAPACK library.
+# Must include the C/C++ LAPACKE interface.
+#export LAPACK_INCLUDE="/path/to/lapack/netlib/lapack-3.9.1/LAPACKE/include/"
+
+if [ -z ${BLAS_ROOT+x} ] || [ -z ${LAPACK_LIB+x} ] || [ -z ${LAPACK_INCLUDE+x} ]; then
+ echo "Please define BLAS_ROOT, LAPACK_LIB, and LAPACK_INCLUDE variables."
+ exit 1
+fi
+
+####################
+####################
+# Smoke tests
+####################
+####################
+
+# We collect the following smoke tests associated with the ALP/Dense reference backend:
+# (Basic targets)
+# General matrix-matrix multiplication (source: tests/smoke/alp_gemm.cpp)
+# Householder tridiagonalization of a real symmetric/complex Hermitian matrix (source: tests/smoke/alp_zhetrd.cpp)
+# Divide and conquer tridiagonal eigensolver for tridiagonal, real symmetric matrices (source: tests/smoke/alp_dstedc.cpp)
+# Eigensolver for real symmetric matrices (source: tests/smoke/alp_syevd.cpp)
+# Householder QR decomposition of a real/complex general matrix (source: tests/smoke/alp_zgeqrf.cpp)
+
+# (Challenge targets)
+# Triangular linear system solve using backsubstitution of upper tridiagonal, real/complex matrix (source: tests/smoke/alp_backsubstitution.cpp)
+# Triangular linear system solve using forwardsubstitution of lower tridiagonal, real/complex matrix (source: tests/smoke/alp_forwardsubstitution.cpp)
+# Cholesky decomposition of a symmetric/Hermitian positive definite matrix (source: tests/smoke/alp_cholesky.cpp)
+# Householder LU decomposition of a real/complex general matrices (source: tests/smoke/alp_zgetrf.cpp)
+# Inverse of a symmetric/Hermitian positive definite matrix (source code: tests/smoke/alp_potri.cpp)
+# Singular value decomposition of a real/complex general matrix (source code: tests/smoke/alp_zgesvd.cpp)
+
+# These tests are collected and run as ALP smoketests as follows:
+
+LOGDIR=$ALP_BUILD/logs
+mkdir -p $LOGDIR
+
+cmake -DWITH_ALP_REFERENCE_BACKEND=ON -DCMAKE_INSTALL_PREFIX=./install $ALP_SOURCE || ( echo "test failed" && exit 1 )
+make install -j$(nproc) || ( echo "test failed" && exit 1 )
+SMOKE_PRINT_TIME=ON make smoketests_alp -j$(nproc) | tee $LOGDIR/alp_smoketests.log
+
+# To compile and run the LAPACK-based tests (not ALP code).
+# Here you can use gcc flags, i.e. "-L/path/tolapack/ -llapack" (or simply " -llapack" to use system installed lapack library).
+KBLAS_LIB=$BLAS_ROOT/lib/kblas/locking
+USECASES=("dstedc" "dsyevd" "dsytrd" "zhetrd" "dgeqrf" "dgesvd" "dgetrf" "dpotri")
+
+for USECASE in "${USECASES[@]}"
+do
+ install/bin/grbcxx -o ${USECASE}_lapack_reference.exe $ALP_SOURCE/tests/performance/lapack_${USECASE}.cpp $LAPACK_LIB/liblapack.a $KBLAS_LIB/libkblas.so -Wl,-rpath $KBLAS_LIB -I$LAPACK_INCLUDE -lgfortran || ( echo "Compiling ${USECASE} failed" && exit 1 )
+done
+
+LOGFILE=$LOGDIR/lapack_smoketests.log
+echo "#####################################################################"
+echo " LAPACK smoketests (seq)" | tee -a $LOGFILE
+echo "#####################################################################"
+for USECASE in "${USECASES[@]}"
+do
+ ( ./${USECASE}_lapack_reference.exe -n 100 -repeat 20 || ( echo "test ${USECASE} failed" && exit 1 ) )
+done | tee -a $LOGFILE
+
+####################
+####################
+# Performance tests
+####################
+####################
+
+# These tests have been executed:
+# On a Kunpeng 920 node with 1 core (alp_dispatch) or 64 cores (alp_omp);
+# Compiling with gcc 9.4.0;
+# Linking against KunpengBLAS (Kunpeng BoostKit 22.0.RC1) and netlib LAPACK.
+# All tests report time in milliseconds after "time (ms, ...)" text line.
+#
+# These instructions assume that you are using "Kunpeng BoostKit 22.0.RC1" extracted in a directory BLAS_ROOT
+# which should contain include/kblas.h file and the lib/kblas/ directory.
+# However, any other blas library could also be used.
+
+####################
+# Compilation and execution of the sequential Cholesky decomposition tests
+# which are testing our ALP Cholesky implementation, based on the alp_dispatch backend, against the potrf LAPACK functionality.
+####################
+
+# If no LAPACK library can be found by the compiler in system directories, LAPACK_LIB and LAPACK_INCLUDE have to be properly set and explicitly provided when calling cmake.
+# If you are using locally installed kblas, make sure to set proper BLAS_ROOT path to "kml" directory, i.e. extracted boostkit-kml-1.6.0-1.aarch64.rpm.
+cmake -DKBLAS_ROOT="$BLAS_ROOT" -DWITH_ALP_DISPATCH_BACKEND=ON -DCMAKE_INSTALL_PREFIX=./install $ALP_SOURCE || ( echo "test failed" && exit 1 )
+make install -j$(nproc) || ( echo "test failed" && exit 1 )
+
+# To compile and run the LAPACK Cholesky test (not ALP code).
+# Here you can use gcc flags, i.e. "-L/path/toib/ -llapack" (or simply " -llapack" to use system installed lapack library).
+# A consistent test should use the same BLAS in LAPACK as in the ALP-based tests.
+install/bin/grbcxx -b alp_dispatch -o cholesky_lapack_reference.exe $ALP_SOURCE/tests/performance/lapack_cholesky.cpp $LAPACK_LIB/liblapack.a -I$LAPACK_INCLUDE -lgfortran || ( echo "test failed" && exit 1 )
+
+LOGFILE=$LOGDIR/lapack_doptrf_seq.log
+echo "#####################################################################"
+echo " Testing potrf: LAPACK + KunpengBLAS (seq)" | tee -a $LOGFILE
+echo "#####################################################################"
+for MSIZE in {400..4000..100}
+do
+ ( ./cholesky_lapack_reference.exe -n ${MSIZE} -repeat 10 || ( echo "test failed" && exit 1 ) )
+done | tee -a $LOGFILE
+echo " Tests completed."
+echo "#####################################################################"
+
+# Run the Cholesky ALP dispatch sequential test.
+# Some facts about the test:
+# The algorithm is a blocked variant of Cholesky with block size BS = 64 (as done in LAPACK).
+# It recursively requires an unblocked version of the same algorithm (of size BSxBS) which does not dispatch to LAPACK.
+# All BLAS functions needed by the algorithm are dispatched to the external BLAS library.
+make test_alp_cholesky_perf_alp_dispatch -j$(nproc) || ( echo "test failed" && exit 1 )
+LOGFILE=$LOGDIR/alp_dpotrf_seq.log
+echo "#####################################################################"
+echo " Testing potrf: ALP + KunpengBLAS (seq)" | tee -a $LOGFILE
+echo "#####################################################################"
+for MSIZE in {400..4000..100}
+do
+ ( tests/performance/alp_cholesky_perf_alp_dispatch -n ${MSIZE} -repeat 10 || ( echo "test failed" && exit 1 ) )
+done | tee -a $LOGFILE
+echo " Tests completed."
+echo "#####################################################################"
+
+####################
+# Compilation and execution of the shared-memory Cholesky decomposition tests
+# which are testing our ALP Cholesky implementation, based on the alp_dispatch backend, against the potrf LAPACK functionality.
+# Differently from the sequential test we link against the shared-memory KBLAS library.
+####################
+
+# Assuming that you are currently in the "build" directory of the ALP cloned repository.
+# If no LAPACK library can be found by the compiler in system directories, LAPACK_LIB and LAPACK_INCLUDE have to be properly set and explicitly provided when calling cmake.
+# If you are using locally installed kblas, make sure to set proper BLAS_ROOT path to "kml" directory, i.e. extracted boostkit-kml-1.6.0-1.aarch64.rpm.
+
+subbuild="build_potrf_with_omp_blas"
+rm -rf $subbuild && mkdir $subbuild && cd $subbuild
+cmake -DKBLAS_ROOT="$BLAS_ROOT" -DKBLAS_IMPL=omp -DWITH_ALP_OMP_BACKEND=ON -DWITH_ALP_DISPATCH_BACKEND=ON -DCMAKE_INSTALL_PREFIX=./install $ALP_SOURCE || ( echo "test failed" && exit 1 )
+make install -j$(nproc) || ( echo "test failed" && exit 1 )
+
+# To compile and run the LAPACK Cholesky test (not ALP code).
+# Here you can use gcc flags, i.e. "-L/path/to/lib/ -llapack" (or simply " -llapack" to use system installed lapack library).
+# A consistent test should use the same BLAS in LAPACK as in the ALP-based tests.
+install/bin/grbcxx -b alp_dispatch -o cholesky_lapack_omp.exe $ALP_SOURCE/tests/performance/lapack_cholesky.cpp $LAPACK_LIB/liblapack.a -I$LAPACK_INCLUDE -lgfortran || ( echo "test failed" && exit 1 )
+
+LOGFILE=$LOGDIR/lapack_dpotrf_omp.log
+for NT in 1 64 96
+do
+ echo "#####################################################################"
+ echo " Testing potrf: LAPACK + KunpengBLAS (omp) with OMP_NUM_THREADS=${NT}"
+ echo "#####################################################################"
+ for MSIZE in {400..4000..100}
+ do
+ OMP_NUM_THREADS=${NT} ./cholesky_lapack_omp.exe -n ${MSIZE} -repeat 10 || ( echo "test failed" && exit 1 )
+ done
+ echo " Tests completed."
+ echo "#####################################################################"
+done | tee -a $LOGFILE
+
+# Run the Cholesky ALP dispatch shared-memory (omp) test.
+# Some facts about the test:
+# The algorithm is a blocked variant of Cholesky with block size BS = 64 (as done in LAPACK).
+# It recursively requires an unblocked version of the same algorithm (of size BSxBS) which does not dispatch to LAPACK.
+# All BLAS functions needed by the algorithm are dispatched to the external BLAS library.
+make test_alp_cholesky_perf_alp_dispatch -j$(nproc) || ( echo "test failed" && exit 1 )
+LOGFILE=$LOGDIR/alp_dpotrf_omp.log
+for NT in 1 64 96
+do
+ echo "##########################################################################"
+ echo "Testing potrf: Testing ALP + KunpengBLAS (omp) with OMP_NUM_THREADS=${NT}"
+ echo "##########################################################################"
+ for MSIZE in {400..4000..100}
+ do
+ OMP_NUM_THREADS=${NT} tests/performance/alp_cholesky_perf_alp_dispatch -n ${MSIZE} -repeat 10 || ( echo "test failed" && exit 1 )
+ done
+ echo " Tests completed."
+ echo "##########################################################################"
+done | tee -a $LOGFILE
+cd $ALP_BUILD
+
+####################
+# Compilation and execution of shared memory parallel mxm tests
+# which are testing our ALP shared memory backend (alp_omp) mxm implementation against the BLAS's gemm functionality.
+# mxm is an inplace, ALP primitive that computes C = C + A*B, with matrices of conforming sizes.
+####################
+
+# Our current shared memory backend implementation is not very flexible and can only use squared thread grids.
+# In the tests below we run both LAPACK and ALP using 64 threads.
+# To ensure a fair comparison, we link with the omp version of KunpengBLAS.
+#
+# You can compile with omp version of kblas library by additionally providing " -DKBLAS_IMPL=omp" flag when calling cmake.
+# However, this should be compiled in a different directory from the other blas calls, as follows:
+subbuild="build_mxm_with_omp_blas"
+rm -rf $subbuild && mkdir $subbuild && cd $subbuild
+cmake -DKBLAS_ROOT="$BLAS_ROOT" -DKBLAS_IMPL=omp -DWITH_ALP_OMP_BACKEND=ON -DWITH_ALP_DISPATCH_BACKEND=ON -DCMAKE_INSTALL_PREFIX=./install $ALP_SOURCE || ( echo "test failed" && exit 1 )
+make install -j$(nproc) || ( echo "test failed" && exit 1 )
+
+# Compile and run gemm-based BLAS test.
+install/bin/grbcxx -b alp_dispatch -o blas_mxm.exe $ALP_SOURCE/tests/performance/blas_mxm.cpp -lgfortran || ( echo "test failed" && exit 1 )
+
+LOGFILE=$LOGDIR/kblas_mxm_omp.log
+echo "##########################################################################"
+echo "Testing mxm: Testing KunpengBLAS (omp) with OMP_NUM_THREADS=64" | tee -a $LOGFILE
+echo "##########################################################################"
+for MSIZE in {1024..10240..1024}
+do
+ OMP_NUM_THREADS=64 ./blas_mxm.exe -n ${MSIZE} -repeat 10 || ( echo "test failed" && exit 1 )
+done | tee -a $LOGFILE
+echo " Tests completed."
+echo "##########################################################################"
+
+cd $ALP_BUILD
+
+# Run mxm omp test.
+# Some facts about the ALP test:
+# The ALP mxm implementation is based on a 2.5D algorithm;
+# In this test we execute with a 3D thread grid of size 4x4x4;
+# We set OMP_NUM_THREADS=64 threads and fix GOMP_CPU_AFFINITY="0-15 24-39 48-63 72-87" to reflect the NUMA domains in the node;
+# The algorithm is allocating memory using a 2D block-cyclic layout with blocks of size 128x128.
+
+subbuild="build_mxm_with_alp_omp"
+rm -rf $subbuild && mkdir $subbuild && cd $subbuild
+cmake -DKBLAS_ROOT="$BLAS_ROOT" -DWITH_ALP_DISPATCH_BACKEND=ON -DWITH_ALP_OMP_BACKEND=ON -DCMAKE_INSTALL_PREFIX=./install $ALP_SOURCE || ( echo "test failed" && exit 1 )
+make test_alp_mxm_perf_alp_omp -j$(nproc) || ( echo "test failed" && exit 1 )
+
+LOGFILE=$LOGDIR/alp_mxm_omp.log
+echo "##########################################################################"
+echo "Testing mxm: Testing KunpengBLAS (omp) with:" | tee -a $LOGFILE
+echo " OMP_NUM_THREADS=64 GOMP_CPU_AFFINITY=\"0-15 24-39 48-63 72-87\"" | tee -a $LOGFILE
+echo "##########################################################################"
+for MSIZE in {1024..10240..1024}
+do
+ GOMP_CPU_AFFINITY="0-15 24-39 48-63 72-87" OMP_NUM_THREADS=64 tests/performance/alp_mxm_perf_alp_omp -n ${MSIZE} -repeat 10 || ( echo "test failed" && exit 1 )
+done | tee -a $LOGFILE
+echo " Tests completed."
+echo "##########################################################################"
+
+cd $ALP_BUILD
diff --git a/bootstrap.sh b/bootstrap.sh
index e24d75d45..1b623b4f5 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -74,6 +74,7 @@ the location where LPF is installed"
echo " --with-banshee= - path to the the tools to compile the banshee backend"
echo " --with-snitch= - path to the tools for Snitch support within the banshee backend"
echo " --with-datasets= - path to the main testing datasets (use tools/downloadDatasets.sh to download)"
+	echo "     --no-alp-reference      - disables the dense alp_reference backend (see also --no-dense)"
echo " --no-reference - disables the reference and reference_omp backends"
echo " --no-hyperdags - disables the hyperdags backend"
echo " --with-hyperdags-using= - uses the given backend reference for HyperDAG generation"
@@ -104,6 +105,9 @@ hyperdags_using=reference
nonblocking=yes
banshee=no
lpf=no
+alp_reference=yes
+alp_dispatch=no
+alp_omp=no
show=no
FLAGS=$''
LPF_INSTALL_PATH=
@@ -163,6 +167,14 @@ or assume default paths (--with-lpf)"
--with-datasets=*)
DATASETS_PATH="${arg#--with-datasets=}"
;;
+ --no-alp-reference)
+ alp_reference=no
+ ;;
+ --no-dense)
+ alp_reference=no
+ alp_dispatch=no
+ alp_omp=no
+ ;;
--no-reference)
reference=no
;;
@@ -363,6 +375,21 @@ the current directory before invocation or confirm the deletion of its content w
if [[ "${nonblocking}" == "no" ]]; then
CMAKE_OPTS+=" -DWITH_NONBLOCKING_BACKEND=OFF"
fi
+ if [[ "${alp_reference}" == "no" ]]; then
+ CMAKE_OPTS+=" -DWITH_ALP_REFERENCE_BACKEND=OFF"
+ else
+ CMAKE_OPTS+=" -DWITH_ALP_REFERENCE_BACKEND=ON"
+ fi
+ if [[ "${alp_dispatch}" == "no" ]]; then
+ CMAKE_OPTS+=" -DWITH_ALP_DISPATCH_BACKEND=OFF"
+ else
+ CMAKE_OPTS+=" -DWITH_ALP_DISPATCH_BACKEND=ON"
+ fi
+ if [[ "${alp_omp}" == "no" ]]; then
+ CMAKE_OPTS+=" -DWITH_ALP_OMP_BACKEND=OFF"
+ else
+ CMAKE_OPTS+=" -DWITH_ALP_OMP_BACKEND=ON"
+ fi
if [[ "${lpf}" == "yes" ]]; then
CMAKE_OPTS+=" -DLPF_INSTALL_PATH='${ABSOLUTE_LPF_INSTALL_PATH}'"
fi
diff --git a/cmake/AddGRBInstall.cmake b/cmake/AddGRBInstall.cmake
index 94bd58f31..78caf2fe1 100644
--- a/cmake/AddGRBInstall.cmake
+++ b/cmake/AddGRBInstall.cmake
@@ -45,6 +45,9 @@ install( EXPORT GraphBLASTargets
set( ALP_UTILS_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}" )
set( SHMEM_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/sequential" )
set( HYPERDAGS_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/hyperdags" )
+set( ALP_REFERENCE_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/alp/reference" )
+set( ALP_DISPATCH_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/alp/dispatch" )
+set( ALP_OMP_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/alp/omp" )
set( BSP1D_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/spmd" )
set( HYBRID_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/hybrid" )
@@ -121,6 +124,34 @@ if( WITH_REFERENCE_BACKEND )
)
endif()
+if( WITH_ALP_REFERENCE_BACKEND )
+ addBackendWrapperGenOptions( "alp_reference"
+ COMPILE_DEFINITIONS "ALP_REFERENCE_INCLUDE_DEFS" "${ALP_REFERENCE_SELECTION_DEFS}"
+ LINK_FLAGS "${ALP_REFERENCE_BACKEND_INSTALL_DIR}/lib${BACKEND_LIBRARY_OUTPUT_NAME}.a"
+ )
+endif()
+
+if( WITH_ALP_DISPATCH_BACKEND )
+ set( _blas_libraries ${BLAS_LIBRARIES} )
+ foreach( _lib ${BLAS_LIBRARIES} )
+ get_filename_component(_path ${_lib} DIRECTORY)
+ # use the -Wl,-rpath,<dir> driver form so the directory reaches the linker
+ set( _path " -Wl,-rpath,${_path}" )
+ list(APPEND _lib_lists ${_path} )
+ endforeach()
+ list( JOIN _lib_lists " " _blas_rpaths )
+ list( JOIN _blas_libraries " " _blas_link_libs )
+ set( _cxx_additional_includes " -I${INCLUDE_INSTALL_DIR}/blas_wrapper " )
+ # quote the expansion: unquoted, an empty KBLAS_INCLUDE_DIR makes if() malformed
+ if( NOT "${KBLAS_INCLUDE_DIR}" STREQUAL "" )
+ set( _cxx_additional_includes " -I${KBLAS_INCLUDE_DIR} ${_cxx_additional_includes}" )
+ endif()
+ addBackendWrapperGenOptions( "alp_dispatch"
+ COMPILE_DEFINITIONS "ALP_DISPATCH_INCLUDE_DEFS" "${ALP_DISPATCH_SELECTION_DEFS}"
+ LINK_FLAGS "${ALP_DISPATCH_BACKEND_INSTALL_DIR}/lib${BACKEND_LIBRARY_OUTPUT_NAME}.a ${_blas_link_libs} ${_blas_rpaths}"
+ COMPILE_OPTIONS "${_cxx_additional_includes}"
+ )
+endif()
+
if( WITH_OMP_BACKEND )
addBackendWrapperGenOptions( "reference_omp"
COMPILE_DEFINITIONS "${REFERENCE_OMP_SELECTION_DEFS}"
@@ -146,6 +177,14 @@ if( WITH_NONBLOCKING_BACKEND )
)
endif()
+if( WITH_ALP_OMP_BACKEND )
+ addBackendWrapperGenOptions( "alp_omp"
+ COMPILE_DEFINITIONS "${ALP_OMP_SELECTION_DEFS}"
+ LINK_FLAGS "'${ALP_OMP_BACKEND_INSTALL_DIR}/lib${BACKEND_LIBRARY_OUTPUT_NAME}.a'"
+ "'${ALP_UTILS_INSTALL_DIR}/lib${ALP_UTILS_LIBRARY_OUTPUT_NAME}.a'" "${NUMA_LFLAG}"
+ )
+endif()
+
# distributed memory backends
if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND )
assert_valid_variables( LPFRUN LPFCPP )
diff --git a/cmake/AddGRBVars.cmake b/cmake/AddGRBVars.cmake
index fab0f9ac9..7b5d12d9a 100644
--- a/cmake/AddGRBVars.cmake
+++ b/cmake/AddGRBVars.cmake
@@ -33,6 +33,10 @@ set( BSP1D_BACKEND_DEFAULT_NAME "backend_bsp1d" )
set( HYBRID_BACKEND_DEFAULT_NAME "backend_hybrid" )
set( HYPERDAGS_BACKEND_DEFAULT_NAME "backend_hyperdags" )
set( NONBLOCKING_BACKEND_DEFAULT_NAME "backend_nonblocking" )
+set( ALP_REFERENCE_BACKEND_DEFAULT_NAME "backend_alp_reference" )
+set( ALP_DISPATCH_BACKEND_DEFAULT_NAME "backend_alp_dispatch" )
+set( ALP_OMP_BACKEND_DEFAULT_NAME "backend_alp_omp" )
+
### COMPILER DEFINITIONS FOR HEADERS INCLUSION AND FOR BACKEND SELECTION
@@ -42,6 +46,9 @@ set( REFERENCE_OMP_INCLUDE_DEFS "_GRB_WITH_OMP" )
set( HYPERDAGS_INCLUDE_DEFS "_GRB_WITH_HYPERDAGS" )
set( NONBLOCKING_INCLUDE_DEFS "_GRB_WITH_NONBLOCKING" )
set( LPF_INCLUDE_DEFS "_GRB_WITH_LPF" )
+set( ALP_REFERENCE_INCLUDE_DEFS "_ALP_WITH_REFERENCE" )
+set( ALP_DISPATCH_INCLUDE_DEFS "_ALP_WITH_DISPATCH" )
+set( ALP_OMP_INCLUDE_DEFS "_ALP_WITH_OMP;_ALP_OMP_WITH_DISPATCH" )
# compiler definitions to select a backend
set( REFERENCE_SELECTION_DEFS "_GRB_BACKEND=reference" )
@@ -51,6 +58,12 @@ set( HYPERDAGS_SELECTION_DEFS
"_GRB_WITH_HYPERDAGS_USING=${WITH_HYPERDAGS_USING}"
)
set( NONBLOCKING_SELECTION_DEFS "_GRB_BACKEND=nonblocking" )
+set( ALP_REFERENCE_SELECTION_DEFS "_ALP_BACKEND=reference" )
+set( ALP_DISPATCH_SELECTION_DEFS "_ALP_BACKEND=dispatch" )
+set( ALP_OMP_SELECTION_DEFS
+ "_ALP_BACKEND=omp"
+ "_ALP_SECONDARY_BACKEND=dispatch"
+)
set( BSP1D_SELECTION_DEFS
"_GRB_BACKEND=BSP1D"
"_GRB_BSP1D_BACKEND=reference"
@@ -64,7 +77,7 @@ set( HYBRID_SELECTION_DEFS
set( NO_NUMA_DEF "_GRB_NO_LIBNUMA" )
### **ALL** BACKENDS, EVEN IF NOT ENABLED BY USER
-set( ALL_BACKENDS "reference" "reference_omp" "hyperdags" "nonblocking" "bsp1d" "hybrid" )
+set( ALL_BACKENDS "reference" "reference_omp" "hyperdags" "nonblocking" "bsp1d" "hybrid" "alp_reference" "alp_dispatch" "alp_omp" )
# list of user-enabled backends, for tests and wrapper scripts (do not change!)
set( AVAILABLE_BACKENDS "" )
@@ -90,6 +103,18 @@ if( WITH_NONBLOCKING_BACKEND )
list( APPEND AVAILABLE_BACKENDS "nonblocking" )
endif()
+if( WITH_ALP_REFERENCE_BACKEND )
+ list( APPEND AVAILABLE_BACKENDS "alp_reference" )
+endif()
+
+if( WITH_ALP_DISPATCH_BACKEND )
+ list( APPEND AVAILABLE_BACKENDS "alp_dispatch" )
+endif()
+
+if( WITH_ALP_OMP_BACKEND )
+ list( APPEND AVAILABLE_BACKENDS "alp_omp" )
+endif()
+
# distributed memory backends
if( WITH_BSP1D_BACKEND )
list( APPEND AVAILABLE_BACKENDS "bsp1d" )
@@ -99,5 +124,7 @@ if( WITH_HYBRID_BACKEND )
list( APPEND AVAILABLE_BACKENDS "hybrid" )
endif()
+message( STATUS "\n######### Configured with the following backends: #########\n${AVAILABLE_BACKENDS}\n" )
+
# add your own here!
diff --git a/cmake/Blas.cmake b/cmake/Blas.cmake
new file mode 100644
index 000000000..09ffae4a5
--- /dev/null
+++ b/cmake/Blas.cmake
@@ -0,0 +1,26 @@
+add_library(cblas INTERFACE)
+if(KBLAS_ROOT)
+ find_package(Kblas REQUIRED)
+ target_link_libraries(cblas INTERFACE Kblas::Kblas)
+ set(HEADER_NAME "kblas")
+else()
+ find_package(BLAS REQUIRED)
+ add_library( extBlas::extBlas UNKNOWN IMPORTED )
+ set_target_properties( extBlas::extBlas
+ PROPERTIES
+ IMPORTED_LOCATION "${BLAS_LIBRARIES}"
+ INTERFACE_LINK_OPTIONS "${BLAS_LINKER_FLAGS}"
+ #INTERFACE_INCLUDE_DIRECTORIES ${}
+ )
+
+ target_link_libraries(cblas INTERFACE extBlas::extBlas)
+ set(HEADER_NAME "cblas")
+endif()
+
+file(WRITE "${CMAKE_BINARY_DIR}/blas_wrapper/alp_blas.h" "#include \"${HEADER_NAME}.h\"\n" )
+#target_include_directories(cblas INTERFACE "${CMAKE_BINARY_DIR}/blas_wrapper" )
+
+target_include_directories( cblas INTERFACE
+ $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/blas_wrapper>
+ $<INSTALL_INTERFACE:blas_wrapper>
+)
diff --git a/cmake/FindKblas.cmake b/cmake/FindKblas.cmake
new file mode 100644
index 000000000..101b7df29
--- /dev/null
+++ b/cmake/FindKblas.cmake
@@ -0,0 +1,137 @@
+#
+# Copyright 2021 Huawei Technologies Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#[===================================================================[
+Find libkblas inside the standard system directories
+
+Read-only output variables:
+ KBLAS_FOUND
+ Indicates that the library has been found.
+
+ KBLAS_INCLUDE_DIR
+ Points to the libkblas include directory.
+
+ KBLAS_LIBRARY
+		Points to the libkblas that can be passed to target_link_libraries.
+
+creates a target Kblas::Kblas to link against libkblas
+#]===================================================================]
+
+# documentation of find_path() https://cmake.org/cmake/help/latest/command/find_path.html
+# documentation of find_library() https://cmake.org/cmake/help/latest/command/find_library.html
+
+if(NOT KBLAS_IMPL)
+ set(KBLAS_IMPL "locking")
+else()
+ if( NOT "${KBLAS_IMPL}" MATCHES "^(locking|nolocking|omp|pthread)$")
+ message( "KBLAS_IMPL = ${KBLAS_IMPL}")
+ # FATAL_ERROR (not the literal word ERROR) is required to actually stop configuration
+ message(FATAL_ERROR " wrong kblas implementation requested")
+ endif()
+endif()
+
+# find the root directory for libkblas
+find_path( KBLAS_ROOT_DIR
+ NAMES include/kblas.h # by checking where "include/kblas.h" exists
+ PATHS ${KBLAS_ROOT} # take as a hint the environment variable KBLAS_ROOT and
+ # add it to the default search paths
+ DOC "KBLAS root directory"
+)
+
+# look for the include directory
+# we should not assume the header is present, because some distributions have
+# different packages for binary-only versions (e.g., libkblas) and for
+# development-oriented versions (e.g., libkblas-dev); hence, look for the header
+# explicitly and raise an error if you cannot find it (otherwise targets will
+# surely not compile!)
+find_path( KBLAS_INCLUDE_DIR
+ NAMES kblas.h # by looking for this header file
+ HINTS ${KBLAS_ROOT_DIR} # start looking from KBLAS_ROOT_DIR, the most likely place
+ PATH_SUFFIXES include # when inspecting a path, look inside the include directory
+ DOC "KBLAS include directory"
+)
+
+# look for the binary library libkblas
+# do not give thorough hints here, because various Linux distributions may have different
+# conventions on shared binary directories (/lib, /usr/lib, /usr/lib64, ...)
+# and we don't want to "blind" CMake's search
+find_library( KBLAS_LIBRARY
+	NAMES kblas	# hence, CMake looks for libkblas.so, libkblas.so.<version>,
+			# libkblas.a and so on (read find_library() guide for more details)
+ HINTS "${KBLAS_ROOT_DIR}/lib/kblas/${KBLAS_IMPL}" # start looking from KBLAS_ROOT_DIR, the most likely place
+ DOC "KBLAS library"
+)
+set( BLAS_LIBRARIES ${KBLAS_LIBRARY} )
+
+find_library( GFORTRAN_LIBRARY
+	NAMES gfortran	# hence, CMake looks for libgfortran.so, libgfortran.so.<version>,
+			# libgfortran.a and so on (read find_library() guide for more details)
+ HINTS ${CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES}
+)
+
+# if the listed variables are set to existing paths, set the Kblas_FOUND variable
+# if not and the REQUIRED option was given when calling this find_module(),
+# raise an error (some components were not found and we need all of them)
+include( FindPackageHandleStandardArgs )
+find_package_handle_standard_args( Kblas
+ REQUIRED_VARS KBLAS_ROOT_DIR KBLAS_INCLUDE_DIR KBLAS_LIBRARY GFORTRAN_LIBRARY
+)
+
+# if we found the library, create a dedicated target with all needed information
+if( Kblas_FOUND )
+ # do not show these variables as cached ones
+ mark_as_advanced( KBLAS_ROOT_DIR KBLAS_INCLUDE_DIR KBLAS_LIBRARY BLAS_LIBRARIES )
+
+ # create an imported target, i.e. a target NOT built internally, as from
+ # https://cmake.org/cmake/help/latest/command/add_library.html#imported-libraries
+ # this way, depending targets may link against libkblas with target_link_libraries(),
+ # as if it was an internal target
+ # UNKNOWN tells CMake to inspect the library type (static or shared)
+ # e.g., if you compiled your own static libkblas and injected it via KBLAS_ROOT
+ # it will work out without changes
+ add_library ( gfortran::gfortran UNKNOWN IMPORTED )
+ add_library ( Kblas::Kblas UNKNOWN IMPORTED )
+	# set its properties to the appropriate locations, for both headers and binaries
+ set_target_properties( gfortran::gfortran
+ PROPERTIES
+ IMPORTED_LOCATION "${GFORTRAN_LIBRARY}"
+ )
+ set_target_properties( Kblas::Kblas
+ PROPERTIES
+ IMPORTED_LOCATION "${KBLAS_LIBRARY}"
+ INTERFACE_INCLUDE_DIRECTORIES ${KBLAS_INCLUDE_DIR}
+ )
+ if(NOT LibM_FOUND)
+ find_package(LibM REQUIRED)
+ endif()
+ target_link_libraries(Kblas::Kblas INTERFACE LibM::LibM gfortran::gfortran)
+ if("${KBLAS_IMPL}" STREQUAL "omp")
+ if(NOT OpenMP_FOUND)
+ find_package(OpenMP REQUIRED)
+ endif()
+ target_link_libraries(Kblas::Kblas INTERFACE OpenMP::OpenMP_C)
+ elseif("${KBLAS_IMPL}" STREQUAL "pthread")
+ if(NOT Threads_FOUND)
+ find_package(Threads REQUIRED)
+ endif()
+ if(NOT CMAKE_USE_PTHREADS_INIT)
+ # FATAL_ERROR (not the literal word ERROR) is required to actually stop configuration
+ message(FATAL_ERROR "pthread not found")
+ endif()
+ target_link_libraries(Kblas::Kblas INTERFACE Threads::Threads)
+ endif()
+endif()
+
+
diff --git a/cmake/KunpengBLAS.cmake b/cmake/KunpengBLAS.cmake
new file mode 100644
index 000000000..cb4ad4f6b
--- /dev/null
+++ b/cmake/KunpengBLAS.cmake
@@ -0,0 +1,67 @@
+#
+# Copyright 2022 Huawei Technologies Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#[===================================================================[
+Find libkml inside the standard system directories
+
+Read-only output variables:
+ KML_FOUND
+ Indicates that the library has been found.
+
+ KML_INCLUDE_DIR
+ Points to the libkml include directory.
+
+creates a target kml::kml to link against libkml
+#]===================================================================]
+
+# documentation of find_path() https://cmake.org/cmake/help/latest/command/find_path.html
+# documentation of find_library() https://cmake.org/cmake/help/latest/command/find_library.html
+
+# find the root directory for libkml
+find_path( KML_ROOT_DIR
+ NAMES lib/kml.h # by checking where "lib/kml.h" exists
+ HINTS ${KML_SOURCE} # start looking from KML_SOURCE, the most likely place
+)
+
+# if the listed variables are set to existing paths, set the kml_FOUND variable
+# if not and the REQUIRED option was given when calling this find_module(),
+# raise an error (some components were not found and we need all of them)
+include( FindPackageHandleStandardArgs )
+find_package_handle_standard_args( KML
+ REQUIRED_VARS KML_ROOT_DIR
+)
+
+# if we found the library, create a dedicated target with all needed information
+if( KML_FOUND )
+ # do not show these variables as cached ones
+ mark_as_advanced( KML_ROOT_DIR )
+
+ # create an imported target, i.e. a target NOT built internally, as from
+ # https://cmake.org/cmake/help/latest/command/add_library.html#imported-libraries
+ # this way, depending targets may link against libkml with target_link_libraries(),
+ # as if it was an internal target
+ # UNKNOWN tells CMake to inspect the library type (static or shared)
+ # e.g., if you compiled your own static libkml and injected it via KML_ROOT
+ # it will work out without changes
+ add_library ( kml INTERFACE )
+	# set its properties to the appropriate locations, for both headers and binaries
+ # set_target_properties( kml::kml
+ # PROPERTIES
+ # INTERFACE_INCLUDE_DIRECTORIES "${KML_ROOT_DIR}"
+ # )
+ target_include_directories ( kml INTERFACE ${KML_ROOT_DIR}
+ )
+endif()
diff --git a/docs/alp-public.conf b/docs/alp-public.conf
new file mode 100644
index 000000000..937501d52
--- /dev/null
+++ b/docs/alp-public.conf
@@ -0,0 +1,2453 @@
+# Doxyfile 1.8.14
+
+#
+# Copyright 2021 Huawei Technologies Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME = "ALP Public API"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER = alpha
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF =
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = docs/alp-public
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise cause
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF = YES
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+
+ALIASES =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
+# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
+# Fortran. In the latter case the parser tries to guess whether the code is fixed
+# or free formatted code, this is the default for Fortran type files), VHDL. For
+# instance to make doxygen treat .inc files as Fortran files (default is PHP),
+# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibilities issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 0.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS = 0
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also make the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen to replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE = NO
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES = NO
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS = YES
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO, these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS = YES
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES, upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES = YES
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES = NO
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS = YES
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES = YES
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST = NO
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST = NO
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= NO
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if ... \endif and \cond
+# ... \endcond blocks.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES = NO
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC = NO
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered.
+# The default value is: NO.
+
+WARN_AS_ERROR = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
+
+INPUT = include/alp.hpp \
+ include/alp/
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
+# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
+# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf.
+
+FILE_PATTERNS = *.hpp \
+ *.cpp \
+ *.h \
+ *.c
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE = include/alp/base
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS = internal \
+ IMF \
+ imf \
+ AutoDeleter \
+ SizeOf
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see https://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML the header file that includes any scripts and style sheets
+# that doxygen needs, which is dependent on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via Javascript. If disabled, the navigation index will
+# consists of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have Javascript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries 1 will produce a full collapsed tree by default. 0 is a special value
+# representing an infinite number of entries and will result in a full expanded
+# tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: https://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want the formulas to look prettier in the HTML output.
+# When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to
+# cancel the search. The filter options can be selected when the cursor is
+# inside the search box by pressing <Shift>+<cursor down>. Also here use the
+# <cursor keys> to select a filter and <Enter> or <escape> to activate or
+# cancel the filter option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: https://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: https://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
+# the project to a relative location where the documentation can be found. The
+# format is: EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when enabling USE_PDFLATEX this option is only used for generating
+# bitmaps for formulas in the HTML output, but not in the Makefile that is
+# written to the output directory.
+# The default file is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify :
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES = amsmath
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string, for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES, to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's config
+# file, i.e. a series of assignments. You only have to provide replacements,
+# missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's config file. A template extensions file can be generated
+# using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE =
+
+# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
+# with syntax highlighting in the RTF output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_SOURCE_CODE = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT = docbook
+
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
+# program listings (including syntax highlighting and cross-referencing
+# information) to the DOCBOOK output. Note that enabling this will significantly
+# increase the size of the DOCBOOK output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_PROGRAMLISTING = NO
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED = __DOXYGEN__
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of 'which perl').
+# The default file (with absolute path) is: /usr/bin/perl.
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see:
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: NO.
+
+HAVE_DOT = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
+# hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file. If left blank, it is assumed
+# PlantUML is not used or called during a preprocessing step. Doxygen will
+# generate a warning when it encounters a \startuml command in this case and
+# will not generate output for the diagram.
+
+PLANTUML_JAR_PATH =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that doxygen if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS = YES
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP = YES
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 319bdf093..5b5bd7f87 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -75,6 +75,7 @@ install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/base/"
add_library( alp_utils_headers INTERFACE )
target_include_directories( alp_utils_headers INTERFACE
$
+ $
)
# copy utils headers
@@ -107,6 +108,11 @@ install( TARGETS backend_headers_nodefs EXPORT GraphBLASTargets
install( TARGETS alp_utils_headers EXPORT GraphBLASTargets
INCLUDES DESTINATION "${INCLUDE_INSTALL_DIR}"
)
+if( WITH_ALP_REFERENCE_BACKEND_HEADERS OR
+ WITH_ALP_OMP_BACKEND_HEADERS
+)
+ add_subdirectory( alp )
+endif()
if( WITH_REFERENCE_BACKEND_HEADERS )
add_library( backend_reference_headers INTERFACE )
diff --git a/include/alp.hpp b/include/alp.hpp
new file mode 100644
index 000000000..3ebc36637
--- /dev/null
+++ b/include/alp.hpp
@@ -0,0 +1,177 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __DOXYGEN__
+
+/**
+ * Define this macro to disable libnuma use.
+ */
+#define _ALP_NO_LIBNUMA
+
+/**
+ * Define this macro to disable thread pinning.
+ */
+#define _ALP_NO_PINNING
+
+/**
+ * Define this macro to compile with PlatformBSP support.
+ */
+#define _ALP_WITH_LPF
+
+/**
+ * Which GraphBLAS backend should be default.
+ *
+ * Known single user-process options:
+ * -# reference
+ * -# reference_omp
+ *
+ * Known multiple user-process options:
+ * -# BSP1D
+ */
+#define _ALP_BACKEND reference
+
+/**
+ * Which GraphBLAS backend the BSP1D backend should use within a single user
+ * process. For possible values, see the single user process options for
+ * #_ALP_BACKEND.
+ */
+#define _ALP_BSP1D_BACKEND
+
+/**
+ * \mainpage Algebraic Programming (ALP) API Specification.
+ *
+ * This document specifies the ALP API.
+ *
+ * \par Containers
+ *
+ * ALP defines the following containers for users to interface with:
+ * -# alp::Scalar
+ * -# alp::Vector
+ * -# alp::Matrix
+ *
+ * Containers take as a template argument \a T the type that the container
+ * stores. The type \a T can be any C++ plain-old-data type.
+ *
+ * ALP defines primitives for performing IO to and from containers in the
+ * \ref IO module.
+ *
+ * \par Algebraic structures
+ *
+ * ALP defines the following algebraic structures to interface with:
+ * -# All binary operators defined in alp::operators;
+ * -# identities defined in alp::identities;
+ * -# alp::Monoid structures by combining binary operators and identities;
+ * -# alp::Semiring structures by combining two operators and two identities.
+ *
+ * For example, a real semiring is composed as follows:
+ * \code
+ * alp::Semiring<
+ * alp::operators::add< double >, alp::operators::mul< double >,
+ * alp::identities::zero, alp::identities::one
+ * > reals;
+ * \endcode
+ * This semiring forms the basis of most numerical linear algebra.
+ *
+ * Our definition of monoid and semirings imply that the domains they operate
+ * over are derived from the operators. For example, to perform half precision
+ * multiplication and accumulate in single precision, the following semiring
+ * may be defined:
+ * \code
+ * alp::Semiring<
+ * alp::operators::add< short float, float, float >,
+ * alp::operators::mul< short float >,
+ * alp::identities::zero, alp::identities::one
+ * > mixedReals;
+ * \endcode
+ *
+ * \par Algebraic primitives
+ *
+ * Operations on containers proceed by calling ALP primitives, which are
+ * parametrised in the algebraic structure the operation should proceed with.
+ * Primitives are grouped in modules that follow roughly the traditional BLAS
+ * taxonomy:
+ * -# \ref BLAS0
+ * -# \ref BLAS1
+ * -# \ref BLAS2
+ * -# \ref BLAS3
+ *
+ * \par Algebraic structures and views
+ *
+ * Containers may have structures (e.g., symmetric) and views (e.g., transpose),
+ * and may be sparse or dense as per alp::Density. Operations are in principle
+ * defined for both sparse \em and dense containers, as well as mixtures of
+ * sparse and dense containers, provided that the right algebraic structures are
+ * provided -- for example, a sparse vector cannot be reduced into a scalar via
+ * alp::foldl when an (associative) operator is given; instead, a monoid
+ * structure is required in order to interpret any missing values in a sparse
+ * vector.
+ *
+ * Views allow for the selection of submatrices from a larger matrix, such as
+ * for example necessary to express Cholesky factorisation algorithms. Views are
+ * constructed through alp::get_view. Please see the slides for concrete
+ * examples.
+ */
+#endif
+
+#ifndef _H_ALP
+#define _H_ALP
+
+// do NOT remove this #if, in order to protect this header from
+// clang-format re-ordering
+#if 1
+// load active configuration
+ #include //defines _ALP_BACKEND
+#endif
+
+// #pragma message "Included ALP.hpp"
+
+// collects the user-level includes
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef _ALP_BACKEND
+// #pragma message "_ALP_BACKEND defined"
+// include also the main data types in order to have the default definitions
+// but ONLY if a default backend is defined; otherwise, the previous headers
+// contain the relevant definitions (without defaults)
+ #include
+ #include
+ #include
+#endif
+
+#endif // end ``_H_ALP''
+
diff --git a/include/alp/CMakeLists.txt b/include/alp/CMakeLists.txt
new file mode 100644
index 000000000..d763bf935
--- /dev/null
+++ b/include/alp/CMakeLists.txt
@@ -0,0 +1,156 @@
+#
+# Copyright 2021 Huawei Technologies Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Definition of GraphBLAS include targets: all targets here defined
+# are interface targets for headers and basic definitions required to build
+# GraphBLAS backends and tests. Importing targets have all basic dependencies
+# and definitions to compile against each backend, but MUST explicitly
+# set a default backend (if they want to do so).
+#
+assert_defined_variables( ALP_REFERENCE_INCLUDE_DEFS WITH_ALP_REFERENCE_BACKEND_HEADERS )
+assert_defined_variables( ALP_OMP_INCLUDE_DEFS WITH_ALP_OMP_BACKEND_HEADERS )
+assert_defined_variables( ALP_DISPATCH_INCLUDE_DEFS WITH_ALP_DISPATCH_BACKEND_HEADERS )
+assert_valid_variables( INCLUDE_INSTALL_DIR )
+
+# to avoid flaky acrobatics with regex or glob expressions, copy main files directly
+install( FILES "../alp.hpp" DESTINATION "${INCLUDE_INSTALL_DIR}" )
+set( root_files
+ "../alp.hpp"
+ "backends.hpp"
+ "blas0.hpp"
+ "blas1.hpp"
+ "blas2.hpp"
+ "blas3.hpp"
+ "config.hpp"
+ "density.hpp"
+ "descriptors.hpp"
+ "exec.hpp"
+ "identities.hpp"
+ "imf.hpp"
+ "init.hpp"
+ "internalops.hpp"
+ "io.hpp"
+ "iomode.hpp"
+ "matrix.hpp"
+ "monoid.hpp"
+ "ops.hpp"
+ "phase.hpp"
+ "rc.hpp"
+ "rels.hpp"
+ "scalar.hpp"
+ "semiring.hpp"
+ "storage.hpp"
+ "structures.hpp"
+ "type_traits.hpp"
+ "utils.hpp"
+ "vector.hpp"
+ "views.hpp"
+)
+set( ALP_INCLUDE_INSTALL_DIR "${INCLUDE_INSTALL_DIR}/alp")
+install( FILES ${root_files} DESTINATION "${ALP_INCLUDE_INSTALL_DIR}" )
+
+# copy base headers and all its subdirectories (if any)
+# note: leave the slash "/" at the end of the DIRECTORY path,
+# otherwise CMake will create a "graphblas/base" directory inside DESTINATION !!!
+# https://cmake.org/cmake/help/latest/command/install.html#installing-directories
+install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/base/"
+ DESTINATION "${ALP_INCLUDE_INSTALL_DIR}/base"
+ FILES_MATCHING REGEX "${HEADERS_REGEX}"
+)
+
+# copy utils headers
+install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/utils/"
+ DESTINATION "${ALP_INCLUDE_INSTALL_DIR}/utils"
+ FILES_MATCHING REGEX "${HEADERS_REGEX}"
+)
+
+if( WITH_ALP_REFERENCE_BACKEND_HEADERS )
+ add_library( backend_alp_reference_headers INTERFACE )
+ target_link_libraries( backend_alp_reference_headers INTERFACE backend_headers_nodefs )
+ target_compile_definitions( backend_alp_reference_headers INTERFACE "${ALP_REFERENCE_INCLUDE_DEFS}" )
+
+ install( TARGETS backend_alp_reference_headers EXPORT GraphBLASTargets )
+endif()
+
+if( WITH_ALP_REFERENCE_BACKEND )
+ install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/alp/reference"
+ DESTINATION "${ALP_INCLUDE_INSTALL_DIR}/alp_reference"
+ FILES_MATCHING REGEX "${HEADERS_REGEX}"
+ )
+endif()
+
+if( WITH_ALP_DISPATCH_BACKEND_HEADERS OR ( WITH_ALP_OMP_BACKEND_HEADERS AND ( "${_ALP_SECONDARY_BACKEND}" EQUAL "dispatch" ) ) )
+ install( FILES "${CMAKE_BINARY_DIR}/blas_wrapper/alp_blas.h" DESTINATION "${INCLUDE_INSTALL_DIR}/blas_wrapper" )
+
+ install(
+ TARGETS cblas EXPORT GraphBLASTargets
+ INCLUDES DESTINATION "${INCLUDE_INSTALL_DIR}/blas_wrapper"
+ )
+endif()
+
+
+if( WITH_ALP_DISPATCH_BACKEND_HEADERS )
+ add_library( backend_alp_dispatch_headers INTERFACE )
+ target_link_libraries( backend_alp_dispatch_headers INTERFACE backend_headers_nodefs )
+ target_compile_definitions( backend_alp_dispatch_headers INTERFACE "${ALP_DISPATCH_INCLUDE_DEFS}" )
+ target_link_libraries( backend_alp_dispatch_headers INTERFACE cblas )
+
+ install( TARGETS backend_alp_dispatch_headers EXPORT GraphBLASTargets )
+endif()
+
+if( WITH_ALP_DISPATCH_BACKEND )
+ install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/dispatch"
+ DESTINATION "${ALP_INCLUDE_INSTALL_DIR}/alp_dispatch"
+ FILES_MATCHING REGEX "${HEADERS_REGEX}"
+ )
+endif()
+
+if( WITH_ALP_OMP_BACKEND_HEADERS )
+ add_library( backend_alp_omp_headers INTERFACE )
+ target_link_libraries( backend_alp_omp_headers INTERFACE backend_headers_nodefs )
+ target_link_libraries( backend_alp_omp_headers INTERFACE OpenMP::OpenMP_CXX )
+ target_link_libraries( backend_alp_omp_headers INTERFACE cblas )
+ target_compile_definitions( backend_alp_omp_headers INTERFACE "${ALP_OMP_INCLUDE_DEFS}" )
+
+ install( TARGETS backend_alp_omp_headers EXPORT GraphBLASTargets )
+endif()
+
+if( WITH_ALP_OMP_BACKEND )
+ install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/alp/omp"
+ DESTINATION "${ALP_INCLUDE_INSTALL_DIR}/alp_omp"
+ FILES_MATCHING REGEX "${HEADERS_REGEX}"
+ )
+endif()
+
+# this target lists the algorithms implemented on top of the generic functionalities,
+# hence it depends only on backend_headers_nodefs
+add_library( alp_algorithms INTERFACE )
+target_link_libraries( alp_algorithms INTERFACE backend_headers_nodefs )
+
+target_include_directories(
+ alp_algorithms INTERFACE
+
+ $
+ $
+)
+
+install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/"
+ DESTINATION "${ALP_INCLUDE_INSTALL_DIR}/algorithms"
+ FILES_MATCHING REGEX "${HEADERS_REGEX}"
+)
+
+install( TARGETS alp_algorithms EXPORT GraphBLASTargets )
diff --git a/include/alp/algorithms/backsubstitution.hpp b/include/alp/algorithms/backsubstitution.hpp
new file mode 100644
index 000000000..e4c266687
--- /dev/null
+++ b/include/alp/algorithms/backsubstitution.hpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+
+#include
+#ifdef DEBUG
+#include "../../../tests/utils/print_alp_containers.hpp"
+#endif
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * Solves linear system Ax=b
+ * where A is UpperTriangular matrix, b is given RHS vector
+ * and x is the solution.
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type of minus operator used in the computation
+ * @tparam Divide Type of divide operator used in the computation
+ * @param[in] A input upper triangular matrix
+ * @param[in] b input RHS vector
+ * @param[out] x solution vector
+ * @param[in] ring The semiring used in the computation
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename D = double,
+ typename View,
+ typename ImfR,
+ typename ImfC,
+ typename Vecx,
+ typename Vecb,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ typename Minus = operators::subtract< D >,
+ typename Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_vector< Vecx >::value &&
+ is_vector< Vecb >::value
+ > * = nullptr
+ >
+ RC backsubstitution(
+ Matrix< D, structures::UpperTriangular, Dense, View, ImfR, ImfC > &A,
+ Vecx &x,
+ Vecb &b,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+ const Divide ÷ = Divide()
+ ) {
+
+ RC rc = SUCCESS;
+
+ if( ( nrows( A ) != size( x ) ) || ( size( b ) != size( x ) ) ) {
+ std::cerr << "Incompatible sizes in trsv.\n";
+ return FAILED;
+ }
+
+ const size_t n = nrows( A );
+
+ for( size_t k = 0; k < n ; ++k ) {
+ Scalar< D > alpha( ring.template getZero< D >() );
+ const size_t i = n - k - 1;
+ //x[i]=(b[i]-A[i,i:].dot(x[i:]))/A[i,i]
+ auto A_i = get_view( A, i, utils::range( i, n ) );
+ auto A_ii = get_view( A, i, utils::range( i, i + 1 ) );
+ auto x_i = get_view( x, utils::range( i, i + 1 ) );
+ auto b_i = get_view( b, utils::range( i, i + 1 ) );
+ auto x_i_n = get_view( x, utils::range( i, n ) );
+ rc = rc ? rc : alp::dot( alpha, A_i, alp::conjugate( x_i_n ), ring );
+ rc = rc ? rc : alp::set( x_i, b_i );
+ rc = rc ? rc : alp::foldl( x_i, alpha, minus );
+ rc = rc ? rc : alp::set( alpha, Scalar< D >( ring.template getZero< D >() ) );
+ rc = rc ? rc : alp::foldl( alpha, A_ii, ring.getAdditiveMonoid() );
+ rc = rc ? rc : alp::foldl( x_i, alpha, divide );
+ }
+
+ return rc;
+ }
+
+ template<
+ typename D = double,
+ typename ViewA,
+ typename ImfRA,
+ typename ImfCA,
+ typename StructX,
+ typename ViewX,
+ typename ImfRX,
+ typename ImfCX,
+ typename StructB,
+ typename ViewB,
+ typename ImfRB,
+ typename ImfCB,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ typename Minus = operators::subtract< D >,
+ typename Divide = operators::divide< D >
+ >
+ RC backsubstitution(
+ Matrix< D, structures::UpperTriangular, Dense, ViewA, ImfRA, ImfCA > &A,
+ Matrix< D, StructX, Dense, ViewX, ImfRX, ImfCX > &X,
+ Matrix< D, StructB, Dense, ViewB, ImfRB, ImfCB > &B,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+ const Divide ÷ = Divide()
+ ) {
+
+ RC rc = SUCCESS;
+
+ if (
+ ( nrows( X ) != nrows( B ) ) ||
+ ( ncols( X ) != ncols( B ) ) ||
+ ( ncols( A ) != nrows( X ) )
+ ) {
+ std::cerr << "Incompatible sizes in trsm.\n";
+ return FAILED;
+ }
+
+ const size_t m = nrows( X );
+ const size_t n = ncols( X );
+
+ for( size_t i = 0; i < n ; ++i ) {
+ auto x = get_view( X, utils::range( 0, m ), i );
+ auto b = get_view( B, utils::range( 0, m ), i );
+ rc = rc ? rc : algorithms::backsubstitution( A, x, b, ring, minus, divide );
+ }
+
+ assert( rc == SUCCESS );
+ return rc;
+ }
+
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/algorithms/cholesky.hpp b/include/alp/algorithms/cholesky.hpp
new file mode 100644
index 000000000..124e63319
--- /dev/null
+++ b/include/alp/algorithms/cholesky.hpp
@@ -0,0 +1,495 @@
+/*
+ * Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+
+#include
+#include // use from grb
+#include
+#include
+#include "../../../tests/utils/print_alp_containers.hpp"
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * Computes the Cholesky decomposition U^TU = H of a real symmetric
+ * positive definite (SPD) (or complex Hermitian positive definite)
+ * matrix H where \a U is upper triangular.
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type minus operator used in the computation
+ * @tparam Divide Type of divide operator used in the computation
+ * @param[out] U output upper triangular matrix
+ * @param[in] H input real symmetric positive definite matrix
+ * @param[in] ring The semiring used in the computation
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename MatU,
+ typename MatH,
+ typename D = typename MatU::value_type,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ typename Minus = operators::subtract< D >,
+ typename Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_matrix< MatU >::value &&
+ is_matrix< MatH >::value &&
+ structures::is_a< typename MatU::structure, structures::UpperTriangular >::value &&
+			// TODO: structures::SymmetricPositiveDefinite should be replaced
+ // with structures::SymmetricPositiveDefinitePositiveDefinite
+ (
+ (
+ !grb::utils::is_complex< D >::value &&
+ structures::is_a< typename MatH::structure, structures::SymmetricPositiveDefinite >::value
+ ) || (
+ grb::utils::is_complex< D >::value &&
+ structures::is_a< typename MatH::structure, structures::HermitianPositiveDefinite >::value
+ )
+ ) &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value
+ > * = nullptr
+ >
+ RC cholesky_uptr(
+ MatU &U,
+ const MatH &H,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+			const Divide &divide = Divide()
+ ) {
+#ifdef DEBUG
+ std::cout << "Entered cholesky_uptr out-of-place non-blocked version.\n";
+#endif
+ RC rc = SUCCESS;
+
+ if(
+ ( nrows( U ) != nrows( H ) ) ||
+ ( ncols( U ) != ncols( H ) )
+ ) {
+ std::cerr << "Incompatible sizes in cholesky_uptr.\n";
+ return FAILED;
+ }
+
+ const size_t n = nrows( H );
+
+ // Out of place specification of the operation
+ Matrix< D, typename MatH::structure > UU( n );
+ rc = rc ? rc : set( UU, H );
+
+#ifdef DEBUG
+ if( rc != SUCCESS ) {
+ std::cerr << " set( UU, H ) failed\n";
+ return rc;
+ }
+ print_matrix( " -- UU -- " , UU );
+#endif
+
+ for( size_t k = 0; k < n; ++k ) {
+#ifdef DEBUG
+ std::cout << "============ Iteration " << k << " ============" << std::endl;
+#endif
+
+ auto a = get_view( UU, k, utils::range( k, n ) );
+#ifdef DEBUG
+ print_vector( " -- a -- " , a );
+#endif
+
+ // U[ k, k ] = alpha = sqrt( UU[ k, k ] )
+ Scalar< D > alpha;
+ rc = rc ? rc : eWiseLambda(
+ [ &alpha, &ring ]( const size_t i, D &val ) {
+ if( i == 0 ) {
+ (void) set( alpha, alp::Scalar< D >( std::sqrt( val ) ) );
+ val = *alpha;
+ }
+ },
+ a
+ );
+
+#ifdef DEBUG
+ std::cout << "alpha " << *alpha << std::endl;
+ if( rc != SUCCESS ) {
+ std::cerr << " eWiseLambda( lambda, view ) (0) failed\n";
+ return rc;
+ }
+#endif
+
+ auto v = get_view( UU, k, utils::range( k + 1, n ) );
+#ifdef DEBUG
+ print_vector( " -- v -- " , v );
+#endif
+ // UU[ k + 1: , k ] = UU[ k + 1: , k ] / alpha
+ rc = rc ? rc : foldl( v, alpha, divide );
+
+#ifdef DEBUG
+ if( rc != SUCCESS ) {
+ std::cerr << " eWiseLambda( lambda, view ) (1) failed\n";
+ return rc;
+ }
+#endif
+
+ // UU[ k+1: , k+1: ] -= v*v^T
+ auto Uprim = get_view( UU, utils::range( k + 1, n ), utils::range( k + 1, n ) );
+
+ auto vstar = conjugate( v );
+ auto vvt = outer( vstar, ring.getMultiplicativeOperator() );
+#ifdef DEBUG
+ print_vector( " -- v -- " , v );
+ print_matrix( " vvt ", vvt );
+#endif
+ rc = rc ? rc : foldl( Uprim, vvt, minus );
+#ifdef DEBUG
+ if( rc != SUCCESS ) {
+ std::cerr << " eWiseLambda( lambda, view ) (2) failed\n";
+ return rc;
+ }
+#endif
+ }
+
+ // Finally collect output into U matrix and return
+ for( size_t k = 0; k < n; ++k ) {
+
+ // U[ k, k: ] = UU[ k, k: ]
+ auto vU = get_view( U, k, utils::range( k, n ) );
+ auto vUU = get_view( UU, k, utils::range( k, n ) );
+
+ rc = set( vU, vUU );
+#ifdef DEBUG
+ if( rc != SUCCESS ) {
+ std::cerr << " set( view, view ) failed\n";
+ return rc;
+ }
+#endif
+ }
+
+ return rc;
+ }
+
+ /**
+ * Computes the blocked version Cholesky decomposition U^TU = H of a real symmetric
+ * positive definite (SPD) matrix H where \a U is upper triangular.
+		 * U^T is equivalent to transpose(U)
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type minus operator used in the computation
+ * @param[out] U output upper triangular matrix
+ * @param[in] H input real symmetric positive definite matrix
+ * @param[in] ring The semiring used in the computation
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename MatU,
+ typename MatH,
+ typename D = typename MatU::value_type,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ typename Minus = operators::subtract< D >,
+ std::enable_if_t<
+ is_matrix< MatU >::value &&
+ is_matrix< MatH >::value &&
+ structures::is_a< typename MatU::structure, structures::UpperTriangular >::value &&
+ structures::is_a< typename MatH::structure, structures::SymmetricPositiveDefinite >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value
+ > * = nullptr
+ >
+ RC cholesky_uptr_blk(
+ MatU &U,
+ const MatH &H,
+ const size_t &bs,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus()
+ ) {
+#ifdef DEBUG
+ std::cout << "Entered cholesky_upr out-of-place blocked version.\n";
+#endif
+ const Scalar< D > zero( ring.template getZero< D >() );
+
+ if(
+ ( nrows( U ) != nrows( H ) ) ||
+ ( ncols( U ) != ncols( H ) )
+ ) {
+ std::cerr << "Incompatible sizes in cholesky_uptr_blk.\n";
+ return FAILED;
+ }
+
+ RC rc = SUCCESS;
+
+ const size_t n = nrows( U );
+
+ Matrix< D, typename MatH::structure > UU( n );
+ rc = rc ? rc : set( UU, H );
+#ifdef DEBUG
+ if( rc != SUCCESS ) {
+ std::cout << "set failed\n";
+ return rc;
+ }
+#endif
+
+ //nb: number of blocks of (max) size bz
+ if( ( bs == 0 ) || ( bs > n ) ) {
+ std::cerr << "Block size has illegal value, bs = " << bs << " .\n";
+ std::cerr << "It should be from interval < 0, " << n << "] .\n";
+ return FAILED;
+ }
+ size_t nb = n / bs;
+ if( n % bs != 0 ){
+ nb = nb + 1;
+ }
+
+
+ for( size_t i = 0; i < nb; ++i ) {
+ const size_t a = std::min( i * bs, n );
+ const size_t b = std::min( ( i + 1 ) * bs, n );
+ const size_t c = n;
+
+ const utils::range range1( a, b );
+ const utils::range range2( b, c );
+
+ auto A11 = get_view( UU, range1, range1 );
+
+ // for complex we should conjugate A12
+ auto A12 = get_view< structures::General >( UU, range1, range2 );
+
+ //A11=cholesky(A11)
+ auto A11_out = get_view( U, range1, range1 );
+
+ rc = rc ? rc : cholesky_uptr( A11_out, A11, ring );
+#ifdef DEBUG
+ if( rc != SUCCESS ) {
+ std::cout << "cholesky_uptr failed\n";
+ return rc;
+ }
+#endif
+
+ auto A12_out = get_view< structures::General >( U, range1, range2 );
+ auto A11_out_T = get_view< alp::view::transpose >( A11_out );
+
+ rc = rc ? rc : algorithms::forwardsubstitution(
+ A11_out_T,
+ A12_out,
+ A12,
+ ring
+ );
+#ifdef DEBUG
+ if( rc != SUCCESS ) {
+ std::cout << "Forwardsubstitution failed\n";
+ return rc;
+ }
+#endif
+
+ Matrix< D, typename MatH::structure > Reflector( ncols( A12_out ) );
+ rc = rc ? rc : set( Reflector, zero );
+ rc = rc ? rc : mxm( Reflector, get_view< alp::view::transpose >( A12_out ), A12_out, ring );
+
+ auto A22 = get_view( UU, range2, range2 );
+ rc = rc ? rc : foldl( A22, Reflector, minus );
+ }
+
+ return rc;
+ }
+
+ /** inplace non-blocked versions, part below diagonal is not modified */
+ template<
+ typename MatU,
+ typename D = typename MatU::value_type,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ typename Minus = operators::subtract< D >,
+ typename Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_matrix< MatU >::value &&
+ structures::is_a< typename MatU::structure, structures::Square >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value
+ > * = nullptr
+ >
+ RC cholesky_uptr(
+ MatU &U,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+			const Divide &divide = Divide()
+ ) {
+#ifdef DEBUG
+ std::cout << "Entered cholesky_upr in-place non-blocked version.\n";
+#endif
+ const Scalar< D > zero( ring.template getZero< D >() );
+
+ RC rc = SUCCESS;
+
+ const size_t n = nrows( U );
+
+ for( size_t k = 0; k < n; ++k ) {
+#ifdef DEBUG
+ std::cout << "============ Iteration " << k << " ============" << std::endl;
+#endif
+
+ auto a = get_view( U, k, utils::range( k, n ) );
+
+ // U[ k, k ] = alpha = sqrt( UU[ k, k ] )
+ Scalar< D > alpha;
+ rc = rc ? rc : eWiseLambda(
+ [ &alpha, &ring ]( const size_t i, D &val ) {
+ if( i == 0 ) {
+ (void) set( alpha, alp::Scalar< D >( std::sqrt( val ) ) );
+ val = *alpha;
+ }
+ },
+ a
+ );
+
+#ifdef DEBUG
+ std::cout << "alpha " << *alpha << std::endl;
+ if( rc != SUCCESS ) {
+ std::cerr << " eWiseLambda( lambda, view ) (0) failed\n";
+ return rc;
+ }
+#endif
+
+ auto v = get_view( U, k, utils::range( k + 1, n ) );
+#ifdef DEBUG
+ print_vector( " -- v -- " , v );
+#endif
+ // UU[ k, k + 1: ] = UU[ k, k + 1: ] / alpha
+ rc = rc ? rc : foldl( v, alpha, divide );
+
+#ifdef DEBUG
+ if( rc != SUCCESS ) {
+ std::cerr << " eWiseLambda( lambda, view ) (1) failed\n";
+ return rc;
+ }
+#endif
+
+ // UU[ k+1: , k+1: ] -= v*v^T
+ auto Uprim = get_view( U, utils::range( k + 1, n ), utils::range( k + 1, n ) );
+
+ auto vvt = outer( v, ring.getMultiplicativeOperator() );
+#ifdef DEBUG
+ print_vector( " -- v -- " , v );
+ print_matrix( " vvt ", vvt );
+#endif
+
+ rc = rc ? rc : foldl( Uprim, vvt, minus );
+#ifdef DEBUG
+ if( rc != SUCCESS ) {
+ std::cerr << " eWiseLambda( lambda, view ) (2) failed\n";
+ return rc;
+ }
+#endif
+
+ }
+
+ return rc;
+ }
+
+
+ /** inplace blocked version, part below diagonal is not modified */
+ template<
+ typename MatU,
+ typename D = typename MatU::value_type,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ typename Minus = operators::subtract< D >,
+ std::enable_if_t<
+ is_matrix< MatU >::value &&
+ structures::is_a< typename MatU::structure, structures::Square >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value
+ > * = nullptr
+ >
+ RC cholesky_uptr_blk(
+ MatU &U,
+ const size_t &bs,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus()
+ ) {
+#ifdef DEBUG
+ std::cout << "Entered cholesky_upr in-place blocked version.\n";
+#endif
+ const Scalar< D > zero( ring.template getZero< D >() );
+
+ RC rc = SUCCESS;
+
+ const size_t n = nrows( U );
+
+ //nb: number of blocks of (max) size bz
+ if( ( bs == 0 ) || ( bs > n ) ) {
+ std::cerr << "Block size has illegal value, bs = " << bs << " .\n";
+ std::cerr << "It should be from interval < 0, " << n << "] .\n";
+ return FAILED;
+ }
+ size_t nb = n / bs;
+ if( n % bs != 0 ){
+ nb = nb + 1;
+ }
+
+
+ for( size_t i = 0; i < nb; ++i ) {
+ const size_t a = std::min( i * bs, n );
+ const size_t b = std::min( ( i + 1 ) * bs, n );
+ const size_t c = n;
+
+ const utils::range range1( a, b );
+ const utils::range range2( b, c );
+
+ auto A11 = get_view< structures::Square >( U, range1, range1 );
+
+ // for complex we should conjugate A12
+ auto A12 = get_view< structures::General >( U, range1, range2 );
+
+ rc = rc ? rc : cholesky_uptr( A11, ring );
+#ifdef DEBUG
+ if( rc != SUCCESS ) {
+ std::cout << "cholesky_uptr failed\n";
+ return rc;
+ }
+#endif
+
+ //auto A11_T = get_view< alp::view::transpose >( A11 );
+ auto A11UT = get_view< structures::UpperTriangular >( U, range1, range1 );
+
+ auto A11UT_T = get_view< alp::view::transpose >( A11UT );
+
+ rc = rc ? rc : algorithms::forwardsubstitution( A11UT_T, A12, ring );
+#ifdef DEBUG
+ if( rc != SUCCESS ) {
+ std::cout << "Forwardsubstitution failed\n";
+ return rc;
+ }
+#endif
+
+ auto A22UT = get_view< structures::Symmetric >( U, range2, range2 );
+ rc = rc ? rc : algorithms::fused_symm_mxm_foldl( A22UT, A12, ring, minus );
+#ifdef DEBUG
+ if( rc != SUCCESS ) {
+ std::cout << "algorithms::fused_symm_mxm_foldl failed\n";
+ return rc;
+ }
+#endif
+ }
+
+ return rc;
+ }
+
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/algorithms/conjugate_gradient.hpp b/include/alp/algorithms/conjugate_gradient.hpp
new file mode 100644
index 000000000..3515a070b
--- /dev/null
+++ b/include/alp/algorithms/conjugate_gradient.hpp
@@ -0,0 +1,295 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author Aristeidis Mastoras
+ */
+
+#ifndef _H_ALP_ALGORITHMS_CONJUGATE_GRADIENT_ALP
+#define _H_ALP_ALGORITHMS_CONJUGATE_GRADIENT_ALP
+
+#include
+
+#include
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * Solves a linear system \f$ b = Ax \f$ with \f$ x \f$ unknown by the Conjugate
+ * Gradients (CG) method on general fields.
+ *
+ * Does not perform any preconditioning.
+ *
+ * @tparam descr The user descriptor
+ * @tparam IOType The input/output vector nonzero type
+ * @tparam ResidualType The type of the residual
+ * @tparam NonzeroType The matrix nonzero type
+ * @tparam InputType The right-hand side vector nonzero type
+ * @tparam Ring The semiring under which to perform CG
+ * @tparam Minus The minus operator corresponding to the inverse of the
+ * additive operator of the given \a Ring.
+ * @tparam Divide The division operator corresponding to the inverse of
+ * the multiplicative operator of the given \a Ring.
+ *
+ * Valid descriptors to this algorithm are:
+ * -# descriptors::no_casting
+ *
+ * By default, i.e., if none of \a ring, \a minus, or \a divide (nor their
+ * types) are explicitly provided by the user, the natural field on double
+ * data types will be assumed.
+ *
+ * \note An abstraction of a field that encapsulates \a Ring, \a Minus, and
+ * \a Divide may be more appropriate. This will also naturally ensure
+ * that demands on domain types are met.
+ *
+ * @param[in,out] x On input: the initial guess to the solution.
+ * On output: the last computed approximation.
+ * @param[in] A The (square) positive semi-definite system
+ * matrix.
+ * @param[in] b The known right-hand side in \f$ Ax = b \f$.
+ * @param[in] max_iterations The maximum number of CG iterations.
+ * @param[in] tol The requested relative tolerance.
+ * @param[out] residual The residual corresponding to output \a x.
+ * @param[in,out] r A temporary vector of the same size as \a x.
+ * @param[in,out] u A temporary vector of the same size as \a x.
+ * @param[in,out] temp A temporary vector of the same size as \a x.
+ * @param[in] ring The semiring under which to perform the CG.
+ * @param[in] minus The inverse of the additive operator of
+ * \a ring.
+ * @param[in] divide The inverse of the multiplicative operator
+ * of \a ring.
+ *
+ * \todo There is a sqrt(...) operator that lives outside of the current
+ * algebraic abstractions. Would be great if that could be eliminated.
+ * See internal issue #89.
+ */
+ template< Descriptor descr = descriptors::no_operation,
+ typename IOType, typename IOStructure,
+ typename ResidualType, typename ResidualStructure,
+ typename NonzeroType, typename NonzeroStructure, typename NonzeroStorage, typename NonzeroView,
+ typename InputType, typename InputStructure, typename InputStorage, typename InputView,
+ class Ring = Semiring< operators::add< IOType >, operators::mul< IOType >, identities::zero, identities::one >,
+ class Minus = operators::subtract< IOType >,
+ class Divide = operators::divide< IOType > >
+ RC conjugate_gradient( Vector< IOType, structures::General, Sparse > & x,
+ const Matrix< NonzeroType, structures::SymmetricPositiveDefinite, Sparse > & A,
+ const Vector< InputType, structures::General, Sparse > & b,
+ const size_t max_iterations,
+ Scalar< ResidualType > tol,
+ size_t & iterations,
+ Scalar< ResidualType, ResidualStructure > & residual,
+ Vector< IOType, structures::General, Sparse > & r,
+ Vector< IOType, structures::General, Sparse > & u,
+ Vector< IOType, structures::General, Sparse > & temp,
+ const Ring & ring = Ring(),
+ const Minus & minus = Minus(),
+ const Divide & divide = Divide() ) {
+
+ // static checks
+ static_assert( std::is_floating_point< ResidualType >::value,
+ "Can only use the CG algorithm with floating-point residual "
+ "types." ); // unless some different norm were used: issue #89
+ static_assert( !( descr & descriptors::no_casting ) || (
+ std::is_same< IOType, ResidualType >::value &&
+ std::is_same< IOType, NonzeroType >::value &&
+ std::is_same< IOType, InputType >::value
+ ), "One or more of the provided containers have differing element types "
+ "while the no-casting descriptor has been supplied"
+ );
+ static_assert( !( descr & descriptors::no_casting ) || (
+ std::is_same< NonzeroType, typename Ring::D1 >::value &&
+ std::is_same< IOType, typename Ring::D2 >::value &&
+ std::is_same< InputType, typename Ring::D3 >::value &&
+ std::is_same< InputType, typename Ring::D4 >::value
+ ), "no_casting descriptor was set, but semiring has incompatible domains "
+ "with the given containers."
+ );
+ static_assert( !( descr & descriptors::no_casting ) || (
+ std::is_same< InputType, typename Minus::D1 >::value &&
+ std::is_same< InputType, typename Minus::D2 >::value &&
+ std::is_same< InputType, typename Minus::D3 >::value
+ ), "no_casting descriptor was set, but given minus operator has "
+ "incompatible domains with the given containers."
+ );
+ static_assert( !( descr & descriptors::no_casting ) || (
+ std::is_same< ResidualType, typename Divide::D1 >::value &&
+ std::is_same< ResidualType, typename Divide::D2 >::value &&
+ std::is_same< ResidualType, typename Divide::D3 >::value
+ ), "no_casting descriptor was set, but given divide operator has "
+ "incompatible domains with the given tolerance type."
+ );
+ static_assert( std::is_floating_point< ResidualType >::value,
+ "Require floating-point residual type."
+ );
+
+ constexpr const Descriptor descr_dense = descr | descriptors::dense;
+ Scalar< ResidualType > zero( ring.template getZero< ResidualType >() );
+ const size_t n = ncols( A );
+
+ // dynamic checks
+ {
+ const size_t m = nrows( A );
+ if( size( x ) != n ) {
+ return MISMATCH;
+ }
+ if( size( b ) != m ) {
+ return MISMATCH;
+ }
+ if( size( r ) != n || size( u ) != n || size( temp ) != n ) {
+ std::cerr << "Error: provided workspace vectors are not of the correct "
+ << "length.\n";
+ return MISMATCH;
+ }
+ if( m != n ) {
+ std::cerr << "Warning: alp::algorithms::conjugate_gradient requires "
+ << "square input matrices, but a non-square input matrix was "
+ << "given instead.\n";
+ return ILLEGAL;
+ }
+
+ // capacities
+ if( capacity( x ) != n ) {
+ return ILLEGAL;
+ }
+ if( capacity( r ) != n || capacity( u ) != n || capacity( temp ) != n ) {
+ return ILLEGAL;
+ }
+
+ // others
+ if( tol <= zero ) {
+ std::cerr << "Error: tolerance input to CG must be strictly positive\n";
+ return ILLEGAL;
+ }
+ }
+
+ // make x and b structurally dense (if not already) so that the remainder
+ // algorithm can safely use the dense descriptor for faster operations
+ {
+ RC rc = SUCCESS;
+ if( nnz( x ) != n ) {
+ rc = set< descriptors::invert_mask | descriptors::structural >(
+ x, x, zero
+ );
+ }
+ if( rc != SUCCESS ) {
+ return rc;
+ }
+ assert( nnz( x ) == n );
+ }
+
+ Scalar< ResidualType > alpha, sigma, bnorm;
+
+ // temp = 0
+ RC ret = set( temp, zero );
+
+ // temp += A * x
+ ret = ret ? ret : mxv< descr_dense >( temp, A, x, ring );
+
+ // r = b - temp;
+ ret = ret ? ret : set( r, zero );
+ ret = ret ? ret : foldl( r, b, ring.getAdditiveMonoid() );
+ assert( nnz( r ) == n );
+ assert( nnz( temp ) == n );
+ ret = ret ? ret : foldl< descr_dense >( r, temp, minus );
+ assert( ret == SUCCESS );
+ assert( nnz( r ) == n );
+
+ // u = r;
+ ret = ret ? ret : set( u, r );
+
+ // sigma = r' * r;
+ sigma = zero;
+ ret = ret ? ret : dot< descr_dense >( sigma, r, r, ring );
+
+ // bnorm = b' * b;
+ bnorm = zero;
+ ret = ret ? ret : dot< descr_dense >( bnorm, b, b, ring );
+
+ if( ret == SUCCESS ) {
+ foldl(tol, tol, ring.getMultiplicativeMonoid() );
+ foldl(tol, bnorm, ring.getMultiplicativeMonoid() );
+ }
+
+ size_t iter = 0;
+
+ do {
+ // temp = 0
+ set( temp, zero );
+
+ // temp += A * u;
+ ret = ret ? ret : mxv< descr_dense >( temp, A, u, ring );
+
+ // residual = u' * temp
+ residual = zero;
+ ret = ret ? ret : dot< descr_dense >( residual, temp, u, ring );
+
+ // alpha = sigma / residual;
+ ret = ret ? ret : apply( alpha, sigma, residual, divide );
+
+ // x = x + alpha * u;
+ ret = ret ? ret : eWiseMul< descr_dense >( x, alpha, u, ring );
+
+ // temp = alpha .* temp
+ // Warning: operator-based foldr requires temp be dense
+ ret = ret ? ret : foldr( alpha, temp, ring.getMultiplicativeMonoid() );
+
+ // r = r - temp;
+ ret = ret ? ret : foldl< descr_dense >( r, temp, minus );
+
+ // residual = r' * r;
+ residual = zero;
+ ret = ret ? ret : dot< descr_dense >( residual, r, r, ring );
+
+ if( ret == SUCCESS ) {
+ if( residual < tol ) {
+ break;
+ }
+ }
+
+ // alpha = residual / sigma;
+ ret = ret ? ret : apply( alpha, residual, sigma, divide );
+
+ // temp = r + alpha * u;
+ ret = ret ? ret : set( temp, r );
+ ret = ret ? ret : eWiseMul< descr_dense >( temp, alpha, u, ring );
+ assert( nnz( temp ) == size( temp ) );
+
+ // u = temp
+ std::swap( u, temp );
+
+ // sigma = residual;
+ sigma = residual;
+
+ } while( iter++ < max_iterations && ret == SUCCESS );
+
+ // output
+ iterations = iter;
+
+ if( ret != SUCCESS ) {
+ return FAILED;
+ } else {
+ return SUCCESS;
+ }
+ }
+
+ } // namespace algorithms
+
+} // end namespace alp
+
+#endif // end _H_ALP_ALGORITHMS_CONJUGATE_GRADIENT_ALP
diff --git a/include/alp/algorithms/forwardsubstitution.hpp b/include/alp/algorithms/forwardsubstitution.hpp
new file mode 100644
index 000000000..05c0dd9bc
--- /dev/null
+++ b/include/alp/algorithms/forwardsubstitution.hpp
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+
+#include
+#ifdef DEBUG
+#include "../../../tests/utils/print_alp_containers.hpp"
+#endif
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * Solves linear system Ax=b
+ * where A is LowerTriangular matrix, b is given RHS vector
+ * and x is the solution.
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type minus operator used in the computation
+ * @tparam Divide Type of divide operator used in the computation
+		 * @param[in] A input lower triangular matrix
+ * @param[in] b input RHS vector
+ * @param[out] x solution vector
+ * @param[in] ring The semiring used in the computation
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename MatA,
+ typename Vecx,
+ typename Vecb,
+ typename D = typename MatA::value_type,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ typename Minus = operators::subtract< D >,
+ typename Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_vector< Vecx >::value &&
+ is_vector< Vecb >::value &&
+ is_matrix< MatA >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value &&
+ structures::is_a< typename MatA::structure, structures::LowerTriangular >::value
+ > * = nullptr
+ >
+ RC forwardsubstitution(
+ MatA &A,
+ Vecx &x,
+ Vecb &b,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+			const Divide &divide = Divide()
+ ) {
+
+ RC rc = SUCCESS;
+
+ if( ( ncols( A ) != size( x ) ) || ( size( b ) != size( x ) ) ) {
+ std::cerr << "Incompatible sizes in trsv.\n";
+ return FAILED;
+ }
+
+ const size_t n = nrows( A );
+
+ for( size_t i = 0; i < n ; ++i ) {
+ Scalar< D > alpha( ring.template getZero< D >() );
+ auto A_i = get_view( A, i, utils::range( 0, i ) );
+ auto A_ii = get_view( A, i, utils::range( i, i + 1 ) );
+ auto x_i = get_view( x, utils::range( i, i + 1 ) );
+ auto b_i = get_view( b, utils::range( i, i + 1 ) );
+ auto x_0_i = get_view( x, utils::range( 0, i ) );
+ rc = rc ? rc : alp::dot( alpha, A_i, alp::conjugate( x_0_i ), ring );
+ rc = rc ? rc : alp::set( x_i, b_i );
+ rc = rc ? rc : alp::foldl( x_i, alpha, minus );
+ rc = rc ? rc : alp::set( alpha, Scalar< D >( ring.template getZero< D >() ) );
+ rc = rc ? rc : alp::foldl( alpha, A_ii, ring.getAdditiveMonoid() );
+ rc = rc ? rc : alp::foldl( x_i, alpha, divide );
+ }
+
+ return rc;
+ }
+
+ /** matrix version */
+ template<
+ typename MatA,
+ typename MatX,
+ typename MatB,
+ typename D = typename MatA::value_type,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ typename Minus = operators::subtract< D >,
+ typename Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_matrix< MatA >::value &&
+ is_matrix< MatX >::value &&
+ is_matrix< MatB >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value &&
+ structures::is_a< typename MatA::structure, structures::LowerTriangular >::value &&
+ structures::is_a< typename MatX::structure, typename MatB::structure >::value
+ > * = nullptr
+ >
+ RC forwardsubstitution(
+ MatA &A,
+ MatX &X,
+ MatB &B,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+			const Divide &divide = Divide()
+ ) {
+
+ RC rc = SUCCESS;
+
+ if (
+ ( nrows( X ) != nrows( B ) ) ||
+ ( ncols( X ) != ncols( B ) ) ||
+ ( ncols( A ) != nrows( X ) )
+ ) {
+ std::cerr << "Incompatible sizes in trsm.\n";
+ return FAILED;
+ }
+
+ const size_t m = nrows( X );
+ const size_t n = ncols( X );
+
+ for( size_t i = 0; i < n ; ++i ) {
+ auto x = get_view( X, utils::range( 0, m ), i );
+ auto b = get_view( B, utils::range( 0, m ), i );
+ rc = rc ? rc : algorithms::forwardsubstitution( A, x, b, ring, minus, divide );
+ }
+
+ return rc;
+ }
+
+ /** inplace versions */
+ template<
+ typename MatA,
+ typename Vecx,
+ typename D = typename MatA::value_type,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ typename Minus = operators::subtract< D >,
+ typename Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_vector< Vecx >::value &&
+ is_matrix< MatA >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value &&
+ structures::is_a< typename MatA::structure, structures::LowerTriangular >::value
+ > * = nullptr
+ >
+ RC forwardsubstitution(
+ MatA &A,
+ Vecx &x,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+			const Divide &divide = Divide()
+ ) {
+
+ RC rc = SUCCESS;
+
+ if( nrows( A ) != size( x ) ) {
+ std::cerr << "Incompatible sizes in trsv.\n";
+ return FAILED;
+ }
+
+ const size_t n = nrows( A );
+
+ for( size_t i = 0; i < n ; ++i ) {
+ Scalar< D > alpha( ring.template getZero< D >() );
+ auto A_i = get_view( A, i, utils::range( 0, i ) );
+ auto A_ii = get_view( A, i, utils::range( i, i + 1 ) );
+ auto x_i = get_view( x, utils::range( i, i + 1 ) );
+ auto x_0_i = get_view( x, utils::range( 0, i ) );
+ rc = rc ? rc : alp::dot( alpha, A_i, alp::conjugate( x_0_i ), ring );
+ rc = rc ? rc : alp::foldl( x_i, alpha, minus );
+ rc = rc ? rc : alp::set( alpha, Scalar< D >( ring.template getZero< D >() ) );
+ rc = rc ? rc : alp::foldl( alpha, A_ii, ring.getAdditiveMonoid() );
+ rc = rc ? rc : alp::foldl( x_i, alpha, divide );
+ }
+
+ return rc;
+ }
+
+ /** inplace matrix version */
+ template<
+ typename MatA,
+ typename MatX,
+ typename D = typename MatA::value_type,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ typename Minus = operators::subtract< D >,
+ typename Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_matrix< MatA >::value &&
+ is_matrix< MatX >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value &&
+ structures::is_a< typename MatA::structure, structures::LowerTriangular >::value &&
+ config::default_backend != Backend::dispatch
+ > * = nullptr
+ >
+ RC forwardsubstitution(
+ MatA &A,
+ MatX &X,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+			const Divide &divide = Divide()
+ ) {
+
+ RC rc = SUCCESS;
+
+ if ( ncols( A ) != nrows( X ) ) {
+ std::cerr << "Incompatible sizes in trsm.\n";
+ return FAILED;
+ }
+
+ const size_t m = nrows( X );
+ const size_t n = ncols( X );
+
+ for( size_t i = 0; i < n ; ++i ) {
+ auto x = get_view( X, utils::range( 0, m ), i );
+ rc = rc ? rc : algorithms::forwardsubstitution( A, x, ring, minus, divide );
+ }
+
+ return rc;
+ }
+
+ /** inplace matrix version */
+ template<
+ typename MatA,
+ typename MatX,
+ typename D = typename MatA::value_type,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ typename Minus = operators::subtract< D >,
+ typename Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_matrix< MatA >::value &&
+ is_matrix< MatX >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value &&
+ structures::is_a< typename MatA::structure, structures::LowerTriangular >::value &&
+ config::default_backend == Backend::dispatch
+ > * = nullptr
+ >
+ RC forwardsubstitution(
+ MatA &A,
+ MatX &X,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+			const Divide &divide = Divide()
+ ) {
+#ifdef DEBUG
+ std::cout << "Entered TRSM in-place matrix version (offloads to blas_trsm).\n";
+#endif
+ (void) ring;
+ (void) minus;
+ (void) divide;
+ RC rc = SUCCESS;
+
+ if ( ncols( A ) != nrows( X ) ) {
+ std::cerr << "Incompatible sizes in trsm.\n";
+ return FAILED;
+ }
+
+ const size_t m = nrows( X );
+ const size_t n = ncols( X );
+
+#ifdef _ALP_WITH_DISPATCH
+ cblas_dtrsm(
+ CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit,
+ m, n,
+ 1,
+ internal::getRawPointerToFirstElement( A ), internal::getLeadingDimension( A ),
+ internal::getRawPointerToFirstElement( X ), internal::getLeadingDimension( X )
+ );
+#endif
+
+ return rc;
+ }
+
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/algorithms/fused_mxm_foldl.hpp b/include/alp/algorithms/fused_mxm_foldl.hpp
new file mode 100644
index 000000000..b4581862d
--- /dev/null
+++ b/include/alp/algorithms/fused_mxm_foldl.hpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * Performs mxm followed by foldl: C += A*AT.
+ * The purpose of this function is to simulate operation fusion.
+ *
+ * @tparam MatrixC Type of a symmetric ALP Matrix C
+ * @tparam MatrixA Type of ALP Matrix A
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring passed to mxm
+ * @tparam Op Type of the operator passed to foldl
+ *
+ * @param[inout] C Matrix C
+ * @param[in] A Matrix A
+ * @param[in] ring Ring passed to mxm
+ * @param[in] op Operator passed to foldl
+ *
+ * @return RC SUCCESS if the execution was correct
+ * \note This does not support complex numbers at the moment.
+ */
+ template<
+ typename MatrixC,
+ typename MatrixA,
+ typename D = typename MatrixC::value_type,
+ typename Ring, typename Op,
+ std::enable_if_t<
+ alp::is_matrix< MatrixC >::value &&
+ alp::is_matrix< MatrixA >::value &&
+ alp::is_semiring< Ring >::value &&
+ alp::is_operator< Op >::value &&
+ config::default_backend != Backend::dispatch
+ > * = nullptr
+ >
+ RC fused_symm_mxm_foldl(
+ MatrixC &C,
+ MatrixA &A,
+ const Ring &ring = Ring(),
+ const Op &op = Op()
+ ) {
+
+ // Verify that the C is of dimensions nrows(A) x nrows(A)
+ const size_t m = ncols( A );
+ if( ( nrows( C ) != m ) || ( ncols( C ) != m ) ) {
+ return MISMATCH;
+ }
+
+ const auto AT = get_view< view::transpose >( A );
+
+ Matrix< D, typename MatrixC::structure, Density::Dense > AAT( m );
+
+ RC rc = SUCCESS;
+
+ // AAT = 0
+ rc = rc ? rc : set( AAT, Scalar< D >( ring.template getZero< D >() ) );
+ assert( rc == SUCCESS );
+
+ // AAT += A * AT
+ rc = rc ? rc : mxm( AAT, AT, A, ring );
+ assert( rc == SUCCESS );
+
+ // C += AAT
+ rc = rc ? rc : foldl( C, AAT, op );
+ assert( rc == SUCCESS );
+
+ return rc;
+ }
+
+ /**
+ * Specialization for dispatch backend. Offloads to syrk.
+ * Assumes that A is transposed.
+ */
+ template<
+ typename MatrixC,
+ typename MatrixA,
+ typename D = typename MatrixC::value_type,
+ typename Ring, typename Op,
+ std::enable_if_t<
+ alp::is_matrix< MatrixC >::value &&
+ alp::is_matrix< MatrixA >::value &&
+ alp::is_semiring< Ring >::value &&
+ alp::is_operator< Op >::value &&
+ config::default_backend == Backend::dispatch
+ > * = nullptr
+ >
+ RC fused_symm_mxm_foldl(
+ MatrixC &C,
+ MatrixA &A,
+ const Ring &ring = Ring(),
+ const Op &op = Op()
+ ) {
+ (void) ring;
+ (void) op;
+
+ // Verify that the C is of dimensions nrows(A) x nrows(A)
+ const size_t k = nrows( A );
+ const size_t m = ncols( A );
+ if( ( nrows( C ) != m ) || ( ncols( C ) != m ) ) {
+ return MISMATCH;
+ }
+
+ RC rc = SUCCESS;
+
+#ifdef _ALP_WITH_DISPATCH
+ cblas_dsyrk(
+ CblasRowMajor, CblasUpper, CblasTrans,
+ m,
+ k,
+ -1,
+ internal::getRawPointerToFirstElement( A ),
+ internal::getLeadingDimension( A ),
+ 1,
+ internal::getRawPointerToFirstElement( C ),
+ internal::getLeadingDimension( C )
+ );
+#endif
+
+ return rc;
+ }
+
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/algorithms/gemm.hpp b/include/alp/algorithms/gemm.hpp
new file mode 100644
index 000000000..ed5f7f5a8
--- /dev/null
+++ b/include/alp/algorithms/gemm.hpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * @brief gemm_like example where a sub-matrix
+ * \f$C_blk = \alpha \cdot At_blk \cdot B_blk + \beta \cdot C_blk\f$,
+ * where \f$At_blk, B_blk, C_blk\f$ are sub-matrices (optionally at
+ * a stride both row- and column-wise) of matrices
+ * \f$A, B, C\f$, respectively, and \f$At_blk\f$ and \f$B_blk\f$ may be
+ * transposed views over the \f$A\f$ and \f$B\f$ sub-matrices
+ * depending on parameters \f$transposeA\f$ and \f$transposeB\f$, respectively.
+ *
+ * @tparam transposeA Whether to transpose A
+ * @tparam transposeB Whether to transpose B
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in computation
+ * @param m Number of rows of matrices \a C_blk and \a At_blk
+ * @param n Number of columns of matrices \a C_blk and \a B_blk
+ * @param k Number of rows of matrix \a B_blk and columns of \a A_blk
+ * @param alpha Alpha scalar parameter
+ * @param A reference to matrix A
+ * @param startAr Row offset of \a At_blk within \a A
+ * @param startAc Column offset of \a At_blk within \a A
+ * @param B reference to matrix B
+ * @param startBr Row offset of \a B_blk within \a B
+ * @param startBc Column offset of \a B_blk within \a B
+ * @param beta Beta scalar parameter
+ * @param C reference to matrix C
+ * @param startCr Row offset of \a C_blk within \a C
+ * @param startCc Column offset of \a C_blk within \a C
+ * @param ring The semiring used for performing operations
+ * @return RC SUCCESS if the execution was correct
+ */
+ template<
+ bool transposeA = false,
+ bool transposeB = false,
+ typename D = double,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >
+ >
+ RC gemm_like_example(
+ const size_t m,
+ const size_t n,
+ const size_t k,
+ const Scalar< D > &alpha,
+ Matrix< D, structures::General, Dense > &A,
+ const size_t startAr,
+ const size_t strideAr,
+ const size_t startAc,
+ const size_t strideAc,
+ Matrix< D, structures::General, Dense > &B,
+ const size_t startBr,
+ const size_t strideBr,
+ const size_t startBc,
+ const size_t strideBc,
+ const Scalar< D > &beta,
+ Matrix< D, structures::General, Dense > &C,
+ const size_t startCr,
+ const size_t strideCr,
+ const size_t startCc,
+ const size_t strideCc,
+ const Ring &ring = Ring()
+ ) {
+
+ // Ensure the compatibility of parameters
+ const size_t endCr = startCr + m * strideCr;
+ const size_t endCc = startCc + n * strideCc;
+ const size_t endAr = transposeA ? startAr + k * strideAr : startAr + m * strideAr;
+ const size_t endAc = transposeA ? startAc + m * strideAc : startAc + k * strideAc;
+ const size_t endBr = transposeB ? startBr + n * strideBr : startBr + k * strideBr;
+ const size_t endBc = transposeB ? startBc + k * strideBc : startBc + n * strideBc;
+
+ if(
+ ( endAr > nrows( A ) ) || ( endAc > ncols( A ) ) ||
+ ( endBr > nrows( B ) ) || ( endBc > ncols( B ) ) ||
+ ( endCr > nrows( C ) ) || ( endCc > ncols( C ) )
+ ) {
+ return MISMATCH;
+ }
+
+ const size_t mA = transposeA ? k : m;
+ const size_t kA = transposeA ? m : k;
+ auto A_blk_orig = get_view(
+ A,
+ utils::range( startAr, startAr + mA * strideAr, strideAr ),
+ utils::range( startAc, startAc + kA * strideAc, strideAc )
+ );
+
+ auto A_blk = get_view< transposeA ? view::transpose : view::original >( A_blk_orig );
+
+ const size_t kB = transposeB ? n : k;
+ const size_t nB = transposeB ? k : n;
+ auto B_blk_orig = get_view(
+ B,
+ utils::range( startBr, startBr + kB * strideBr, strideBr ),
+ utils::range( startBc, startBc + nB * strideBc, strideBc )
+ );
+
+ auto B_blk = get_view< transposeB ? view::transpose : view::original >( B_blk_orig );
+
+ auto C_blk = get_view(
+ C,
+ utils::range( startCr, startCr + m * strideCr, strideCr ),
+ utils::range( startCc, startCc + n * strideCc, strideCc )
+ );
+
+ Matrix< D, structures::General, Dense > C_tmp( m, n );
+
+ RC rc = SUCCESS;
+
+ // C_blk = beta * C_blk
+ rc = rc ? rc : foldr( beta, C_blk, ring.getMultiplicativeMonoid() );
+ assert( rc == SUCCESS );
+
+ // C_tmp = 0
+ rc = rc ? rc : set( C_tmp, Scalar< D >( ring.template getZero< D >() ) );
+ assert( rc == SUCCESS );
+ // C_tmp += At_blk * B_blk
+ rc = rc ? rc : mxm( C_tmp, A_blk, B_blk, ring );
+ assert( rc == SUCCESS );
+
+ // C_blk += alpha * C_tmp
+ rc = rc ? rc : eWiseMul( C_blk, alpha, C_tmp, ring );
+ assert( rc == SUCCESS );
+
+ return rc;
+ }
+
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/algorithms/householder_bidiag.hpp b/include/alp/algorithms/householder_bidiag.hpp
new file mode 100644
index 000000000..25b44cd73
--- /dev/null
+++ b/include/alp/algorithms/householder_bidiag.hpp
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+#include // use from grb
+#ifdef DEBUG
+#include "../tests/utils/print_alp_containers.hpp"
+#endif
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * Given a general matrix H perform inplace Householder reflections in order to
+ * eliminate column elements H[i+d:,i] (below diagonal or subdiagonal), H = U H.
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type minus operator used in the computation
+ * @tparam Divide Type of divide operator used in the computation
+ * @param[in,out] U updated orthogonal matrix
+ * @param[in,out] H updated general matrix with column
+ * elements H[i+d:,i] eliminated
+ * @param[in] i column to eliminate
+ * @param[in] d offset from diagonal to eliminate, default 0
+ * @param[in] ring A semiring for operations
+ * @return RC SUCCESS if the execution was correct
+ */
+ template<
+ typename MatH,
+ typename MatU,
+ typename IndexType,
+ typename D = typename MatH::value_type,
+ class Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >,
+ std::enable_if_t<
+ structures::is_a< typename MatH::structure, structures::General >::value &&
+ structures::is_a< typename MatU::structure, structures::Orthogonal >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value
+ > * = nullptr
+ >
+ RC elminate_below_ith_diag(
+ const IndexType i,
+ MatH &H,
+ MatU &U,
+ const IndexType d = 0,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+ const Divide &divide = Divide()
+ ) {
+ RC rc = SUCCESS;
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+
+ const IndexType m = nrows( H );
+ const IndexType n = ncols( H );
+
+ // v=copy(A0[i+d:,i])
+ auto a = get_view( H, utils::range( i + d, m ), i );
+ Vector< D > v( m - ( i + d ) );
+ rc = rc ? rc : alp::set( v, a );
+
+ // alpha=v[0]/abs(v[0])
+ Scalar< D > alpha( zero );
+ auto v0 = get_view( v, utils::range( 0, 1 ) );
+ rc = rc ? rc : foldl( alpha, v0, ring.getAdditiveMonoid() );
+ rc = rc ? rc : foldl( alpha, Scalar< D >( std::abs( *alpha ) ), divide );
+
+ // alpha=alpha*norm(v)
+ Scalar< D > norm_v1( zero );
+ rc = rc ? rc : norm2( norm_v1, v, ring );
+ rc = rc ? rc : foldl( alpha, norm_v1, ring.getMultiplicativeOperator() );
+
+ // v[0]=v[0]-alpha
+ rc = rc ? rc : foldl( v0, alpha, minus );
+
+ // v=v/norm(v)
+ Scalar< D > norm_v2( zero );
+ rc = rc ? rc : norm2( norm_v2, v, ring );
+ rc = rc ? rc : foldl( v, norm_v2, divide );
+
+ //P1=zeros((m-(i+d),m-(i+d))).astype(complex)
+ //P1=P1-2*outer(v,conjugate(v))
+ auto vvh = outer( v, ring.getMultiplicativeOperator() );
+ Matrix< D, typename decltype( vvh )::structure, Dense > reflector( m - ( i + d ) );
+ rc = rc ? rc : alp::set( reflector, vvh );
+ rc = rc ? rc : foldl( reflector, Scalar< D > ( -2 ), ring.getMultiplicativeOperator() );
+
+ // A0=P.dot(A0)
+ auto Hupdate = get_view( H, utils::range( i + d, m ), utils::range( 0, n ) );
+ Matrix< D, structures::General, Dense > Temp1( m - ( i + d ) , n );
+ rc = rc ? rc : alp::set( Temp1, Hupdate );
+ rc = rc ? rc : mxm( Hupdate, reflector, Temp1, ring );
+
+ // Uk=Uk.dot(P)
+ auto Uupdate = get_view< structures::OrthogonalColumns >( U, utils::range( 0, m ), utils::range( i + d, m ) );
+ Matrix< D, structures::OrthogonalColumns, Dense > Temp2( m, m - ( i + d ) );
+ rc = rc ? rc : alp::set( Temp2, Uupdate );
+ rc = rc ? rc : mxm( Uupdate, Temp2, reflector, ring );
+
+ return rc;
+ }
+
+ /**
+ * Computes Householder (inplace) bidiagonalisation of general matrix \f$H = U B V \f$
+ * where \a H is general (complex or real),
+ * \a U orthogonal, \a B is bidiagonal and \a V orthogonal.
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type minus operator used in the computation
+ * @tparam Divide Type of divide operator used in the computation
+ * @param[in,out] U updated orthogonal matrix
+ * @param[in,out] V updated orthogonal matrix
+ * @param[in,out] H input general matrix, output bidiagonal matrix (B)
+ * @param[in] ring A semiring for operations
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename MatH,
+ typename D = typename MatH::value_type,
+ typename MatU,
+ typename MatV,
+ class Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_matrix< MatH >::value &&
+ is_matrix< MatU >::value &&
+ is_matrix< MatV >::value &&
+ structures::is_a< typename MatH::structure, structures::General >::value &&
+ structures::is_a< typename MatU::structure, structures::Orthogonal >::value &&
+ structures::is_a< typename MatV::structure, structures::Orthogonal >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value
+ > * = nullptr
+ >
+ RC householder_bidiag(
+ MatU &U,
+ MatH &H,
+ MatV &V,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+ const Divide &divide = Divide()
+ ) {
+ RC rc = SUCCESS;
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+
+ const size_t m = nrows( H );
+ const size_t n = ncols( H );
+
+ // check sizes
+ if(
+ ( ncols( U ) != nrows( H ) ) ||
+ ( ncols( H ) != nrows( V ) )
+ ) {
+ std::cerr << "Incompatible sizes in householder_bidiag.\n";
+ return FAILED;
+ }
+
+ //for i in range(min(n,m)):
+ for( size_t i = 0; i < std::min( n, m ); ++i ) {
+ // eliminate column elements below ith diagonal element
+ if( i < std::min( n, m - 1 ) ) {
+ rc = rc ? rc : elminate_below_ith_diag( i, H, U, static_cast< size_t >( 0 ), ring, minus, divide );
+ }
+ // eliminate row elements to the right from (i+1)th diagonal element
+ if( i < std::min( n - 2, m ) ) {
+ auto HT = get_view< alp::view::transpose >( H );
+ auto VT = get_view< alp::view::transpose >( V );
+ rc = rc ? rc : elminate_below_ith_diag( i, HT, VT, static_cast< size_t >( 1 ), ring, minus, divide );
+ }
+ }
+
+ return rc;
+
+ }
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/algorithms/householder_lu.hpp b/include/alp/algorithms/householder_lu.hpp
new file mode 100644
index 000000000..a4c8b7034
--- /dev/null
+++ b/include/alp/algorithms/householder_lu.hpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <numeric> //iota
+#include
+#include
+
+#include
+#include // use from grb
+#ifdef DEBUG
+#include "../tests/utils/print_alp_containers.hpp"
+#endif
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * Computes Householder LU decomposition of general matrix \f$H = LU\f$
+ * where \a H is general (complex or real),
+ * \a L lower trapezoidal,
+ * \a U is upper trapezoidal.
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type minus operator used in the computation
+ * @tparam Divide Type of divide operator used in the computation
+ * @param[out] L output lower trapezoidal matrix
+ * @param[out] U output upper trapezoidal matrix
+ * @param[out] p output permutation vector
+ * @param[in] H input general matrix
+ * @param[in] ring A semiring for operations
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename MatH,
+ typename D = typename MatH::value_type,
+ typename MatL,
+ typename MatU,
+ typename IndexType,
+ class Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >,
+ std::enable_if_t<
+ std::is_integral< IndexType >::value &&
+ is_matrix< MatH >::value &&
+ is_matrix< MatL >::value &&
+ is_matrix< MatU >::value &&
+ structures::is_a< typename MatH::structure, structures::General >::value &&
+ structures::is_a< typename MatL::structure, structures::LowerTrapezoidal >::value &&
+ structures::is_a< typename MatU::structure, structures::UpperTrapezoidal >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value
+ > * = nullptr
+ >
+ RC householder_lu(
+ const MatH &H,
+ MatL &L,
+ MatU &U,
+ Vector< IndexType > &p,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+ const Divide &divide = Divide()
+ ) {
+ RC rc = SUCCESS;
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+
+ const size_t m = nrows( H );
+ const size_t n = ncols( H );
+ const size_t kk = std::min( n, m );
+
+ // initialize permutation vector to identity permutation
+ alp::set< alp::descriptors::use_index >( p, alp::Scalar< IndexType >( 0 ) );
+
+ // check sizes
+ if(
+ ( nrows( L ) != nrows( H ) ) ||
+ ( ncols( U ) != ncols( H ) ) ||
+ ( nrows( U ) != kk ) ||
+ ( ncols( L ) != kk )
+ ) {
+#ifdef DEBUG
+ std::cerr << " n, kk, m = " << n << ", " << kk << ", " << m << "\n";
+ std::cerr << "Incompatible sizes in householder_lu.\n";
+#endif
+ return FAILED;
+ }
+
+
+ // L = identity( n )
+ auto Ldiag = alp::get_view< alp::view::diagonal >( L );
+ rc = rc ? rc : alp::set( Ldiag, one );
+ if( rc != SUCCESS ) {
+ std::cerr << " alp::set( L, I ) failed\n";
+ return rc;
+ }
+
+ // Out of place specification of the computation
+ MatH HWork( m, n );
+ rc = rc ? rc : alp::set( HWork, H );
+ if( rc != SUCCESS ) {
+ std::cerr << " alp::set( HWork, H ) failed\n";
+ return rc;
+ }
+
+ Vector< D > PivotVec( n );
+ rc = rc ? rc : alp::set( PivotVec, zero );
+
+ for( size_t k = 0; k < std::min( n, m ); ++k ) {
+ // ===== algorithm =====
+ // a = H[ k, k ]
+ // v = H[ k + 1 : , k ]
+ // w = H[ k, k + 1 : ]
+ // Ak = H[ k + 1 :, k + 1 : ]
+ // v = v / a
+ // Ak = Ak - outer(v,w)
+ // scalar view should replace vector view of length 1 (issue #598)
+ // besides here there are many places in the use cases where this should be changed
+ auto a_view = alp::get_view( HWork, utils::range( k, k + 1 ), k );
+ auto v_view = alp::get_view( HWork, utils::range( k + 1, m ), k );
+ auto w_view = alp::get_view( HWork, k, utils::range( k + 1, n ) );
+ auto Ak_view = alp::get_view( HWork, utils::range( k + 1, m ), utils::range( k + 1, n ) );
+
+ Scalar< D > alpha( zero );
+ rc = rc ? rc : alp::foldl( alpha, a_view, ring.getAdditiveMonoid() );
+
+ // pivoting: find index ipivot
+ size_t ipivot = k;
+ rc = rc ? rc : eWiseLambda(
+ [ &alpha, &ipivot, &k ]( const size_t i, D &val ) {
+ if( std::abs( val ) > std::abs( *alpha ) ) {
+ *alpha = val;
+ ipivot = i + k + 1;
+ }
+ },
+ v_view
+ );
+ // do pivoting if needed
+ if( ipivot > k ) {
+ //p[ ipivot ] <-> p[ k ]
+ auto p1 = alp::get_view( p, utils::range( k, k + 1 ) );
+ auto p2 = alp::get_view( p, utils::range( ipivot, ipivot + 1 ) );
+ Vector< size_t > ptmp( 1 );
+ rc = rc ? rc : alp::set( ptmp, p1 );
+ rc = rc ? rc : alp::set( p1, p2 );
+ rc = rc ? rc : alp::set( p2, ptmp );
+
+ //HWork[ ipivot ] <-> HWork[ k ]
+ auto v1 = alp::get_view( HWork, k, utils::range( 0, n ) );
+ auto v2 = alp::get_view( HWork, ipivot, utils::range( 0, n ) );
+ rc = rc ? rc : alp::set( PivotVec, v1 );
+ rc = rc ? rc : alp::set( v1, v2 );
+ rc = rc ? rc : alp::set( v2, PivotVec );
+ }
+
+ rc = rc ? rc : alp::foldl( v_view, alpha, divide );
+
+ auto w_view_star = conjugate( w_view );
+ auto Reflector = alp::outer( v_view, w_view_star, ring.getMultiplicativeOperator() );
+
+ rc = rc ? rc : alp::foldl( Ak_view, Reflector, minus );
+
+ }
+
+
+ // save the result in L and U
+ auto H_Utrapez = get_view< structures::UpperTrapezoidal >( HWork, utils::range( 0, kk ), utils::range( 0, n ) );
+ rc = rc ? rc : alp::set( U, H_Utrapez );
+
+ auto H_Ltrapez = get_view< structures::LowerTrapezoidal >( HWork, utils::range( 1, m ), utils::range( 0, kk ) );
+ auto L_lowerTrapez = get_view( L, utils::range( 1, m ), utils::range( 0, kk ) );
+ rc = rc ? rc : alp::set( L_lowerTrapez, H_Ltrapez );
+
+ return rc;
+
+ }
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/algorithms/householder_qr.hpp b/include/alp/algorithms/householder_qr.hpp
new file mode 100644
index 000000000..b62c87f7e
--- /dev/null
+++ b/include/alp/algorithms/householder_qr.hpp
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+#include // use from grb
+#ifdef DEBUG
+#include "../tests/utils/print_alp_containers.hpp"
+#endif
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * @brief Computes Householder QR decomposition of general matrix \f$H = QR\f$
+ * where \a H is general (complex or real),
+ * \a R is upper triangular (if H is not square,
+ * R is of the same shape with zeros below diagonal), and
+ * \a Q is orthogonal.
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type minus operator used in the computation
+ * @tparam Divide Type of divide operator used in the computation
+ * @param[out] Q output orthogonal matrix such that H = Q R
+ * @param[out] R output same shape as H with zeros below diagonal
+ * @param[in] H input general matrix
+ * @param[in] ring A semiring for operations
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename D,
+ typename GeneralType,
+ typename GenView,
+ typename GenImfR,
+ typename GenImfC,
+ typename OrthogonalType,
+ typename OrthogonalView,
+ typename OrthogonalImfR,
+ typename OrthogonalImfC,
+ class Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >
+ >
+ RC householder_qr(
+ Matrix< D, GeneralType, alp::Dense, GenView, GenImfR, GenImfC > &H,
+ Matrix< D, OrthogonalType, alp::Dense, OrthogonalView, OrthogonalImfR, OrthogonalImfC > &Q,
+ Matrix< D, GeneralType, alp::Dense, GenView, GenImfR, GenImfC > &R,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+ const Divide &divide = Divide()
+ ) {
+ RC rc = SUCCESS;
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+ const size_t n = nrows( H );
+ const size_t m = ncols( H );
+
+#ifdef DEBUG
+ std::cout << " n, m= " << n << ", " << m << "\n";
+#endif
+
+ // Q = identity( n )
+ rc = alp::set( Q, zero );
+ auto Qdiag = alp::get_view< alp::view::diagonal >( Q );
+ rc = rc ? rc : alp::set( Qdiag, one );
+ if( rc != SUCCESS ) {
+ std::cerr << " alp::set( Q, I ) failed\n";
+ return rc;
+ }
+
+ // Out of place specification of the computation
+ Matrix< D, GeneralType, alp::Dense > RR( n, m );
+
+ rc = alp::set( RR, H );
+ if( rc != SUCCESS ) {
+ std::cerr << " alp::set( RR, H ) failed\n";
+ return rc;
+ }
+#ifdef DEBUG
+ print_matrix( " << RR >> ", RR );
+#endif
+
+ // a temporary for storing the alp::mxm result
+ Matrix< D, OrthogonalType, alp::Dense > Qtmp( n, n );
+
+ for( size_t k = 0; k < std::min( n-1, m ); ++k ) {
+#ifdef DEBUG
+ std::string matname( " << RR(" );
+ matname = matname + std::to_string( k );
+ matname = matname + std::string( ") >> " );
+ print_matrix( matname, RR );
+#endif
+
+ //const size_t m = n - k - 1;
+
+ // ===== Begin Computing v =====
+ // v = H[ k + 1 : , k ]
+ // alpha = norm( v ) * v[ 0 ] / norm( v[ 0 ] )
+ // v = v - alpha * e1
+ // v = v / norm ( v )
+
+ auto v_view = alp::get_view( RR, utils::range( k, n ), k );
+ Vector< D, GeneralType, alp::Dense > v( n - k );
+ rc = alp::set( v, v_view );
+ if( rc != SUCCESS ) {
+ std::cerr << " alp::set( v, view ) failed\n";
+ return rc;
+ }
+
+ Scalar< D > alpha( zero );
+ rc = alp::norm2( alpha, v, ring );
+ if( rc != SUCCESS ) {
+ std::cerr << " alp::norm2( alpha, v, ring ) failed\n";
+ return rc;
+ }
+
+ rc = alp::eWiseLambda(
+ [ &alpha, &ring, ÷, &minus ]( const size_t i, D &val ) {
+ if ( i == 0 ) {
+ Scalar< D > norm_v0( std::abs( val ) );
+ Scalar< D > val_scalar( val );
+ alp::foldl( alpha, val_scalar, ring.getMultiplicativeOperator() );
+ alp::foldl( alpha, norm_v0, divide );
+ alp::foldl( val_scalar, alpha, minus );
+ val = *val_scalar;
+ }
+ },
+ v
+ );
+ if( rc != SUCCESS ) {
+ std::cerr << " alp::eWiseLambda( lambda, v ) failed\n";
+ return rc;
+ }
+
+ Scalar< D > norm_v( zero );
+ rc = alp::norm2( norm_v, v, ring );
+ if( rc != SUCCESS ) {
+ std::cerr << " alp::norm2( norm_v, v, ring ) failed\n";
+ return rc;
+ }
+
+ rc = alp::foldl( v, norm_v, divide );
+#ifdef DEBUG
+ print_vector( " v = ", v );
+#endif
+ // ===== End Computing v =====
+
+ // ===== Calculate reflector Qk =====
+ // Q_k = identity( n )
+ typedef typename std::conditional<
+ grb::utils::is_complex< D >::value,
+ structures::Hermitian,
+ structures::Symmetric
+ >::type SymmOrHerm;
+ Matrix< D, SymmOrHerm, alp::Dense > Qk( n );
+ rc = alp::set( Qk, zero );
+ auto Qk_diag = alp::get_view< alp::view::diagonal >( Qk );
+ rc = rc ? rc : alp::set( Qk_diag, one );
+
+ // this part can be rewritten without temp matrix using functors
+ Matrix< D, SymmOrHerm, alp::Dense > vvt( n - k );
+
+ rc = rc ? rc : alp::set( vvt, alp::outer( v, ring.getMultiplicativeOperator() ) );
+ rc = rc ? rc : alp::foldr( Scalar< D >( 2 ), vvt, ring.getMultiplicativeOperator() );
+
+ // Qk = Qk - vvt ( expanded: I - 2 * vvt )
+ auto Qk_view = alp::get_view< SymmOrHerm >(
+ //auto Qk_view = alp::get_view< GeneralType >(
+ Qk,
+ utils::range( k, n ),
+ utils::range( k, n )
+ );
+ rc = rc ? rc : alp::foldl( Qk_view, vvt, minus );
+
+#ifdef DEBUG
+ print_matrix( " << Qk >> ", Qk );
+#endif
+ // ===== End of Calculate reflector Qk ====
+
+ // ===== Update RR =====
+ // RR = Qk * RR
+
+ // QkRR = Qk * RR
+ Matrix< D, GeneralType, alp::Dense > QkRR( n, m );
+ rc = rc ? rc : alp::set( QkRR, zero );
+ rc = rc ? rc : alp::mxm( QkRR, Qk, RR, ring );
+ if( rc != SUCCESS ) {
+ std::cerr << " alp::mxm( QkRR, Qk, RR, ring ); failed\n";
+ return rc;
+ }
+#ifdef DEBUG
+ print_matrix( " << Qk x RR >> ", QkRR );
+#endif
+ rc = rc ? rc : alp::set( RR, QkRR );
+
+#ifdef DEBUG
+ print_matrix( " << RR( updated ) >> ", RR );
+#endif
+ // ===== End of Update R =====
+
+ // ===== Update Q =====
+ // Q = Q * conjugate(transpose(Qk))
+
+ // Qtmp = Q * conjugate(transpose(Qk))
+ rc = rc ? rc : alp::set( Qtmp, zero );
+ if( grb::utils::is_complex< D >::value ) {
+ rc = rc ? rc : alp::mxm(
+ Qtmp,
+ Q,
+ alp::conjugate( alp::get_view< alp::view::transpose >( Qk ) ),
+ ring
+ );
+ } else {
+ rc = rc ? rc : alp::mxm( Qtmp, Q, Qk, ring );
+ }
+
+ // Q = Qtmp
+ rc = rc ? rc : alp::set( Q, Qtmp );
+#ifdef DEBUG
+ print_matrix( " << Q updated >> ", Q );
+#endif
+ // ===== End of Update Q =====
+ }
+
+ // R = RR
+ rc = rc ? rc : alp::set( R, RR );
+ return rc;
+ }
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/algorithms/householder_tridiag.hpp b/include/alp/algorithms/householder_tridiag.hpp
new file mode 100644
index 000000000..3d946f34c
--- /dev/null
+++ b/include/alp/algorithms/householder_tridiag.hpp
@@ -0,0 +1,229 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <sstream>
+
+#include <alp.hpp>
+#include <graphblas/utils/iscomplex.hpp> // use from grb
+#include "../tests/utils/print_alp_containers.hpp"
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * @brief Computes Householder tridiagonalization \f$H = QTQ^T\f$
+ * where \a H is real symmetric, \a T is symmetric tridiagonal, and
+ * \a Q is orthogonal.
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type minus operator used in the computation
+ * @tparam Divide Type of divide operator used in the computation
+ * @param[out] Q output orthogonal matrix such that H = Q T Q^T
+ * @param[out] T output symmetric tridiagonal matrix such that H = Q T Q^T
+ * @param[in] H input symmetric matrix
+ * @param[in] ring A semiring for operations
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename D,
+ typename SymmOrHermType,
+ typename SymmOrHermTridiagonalType,
+ typename OrthogonalType,
+ class Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >
+ >
+ RC householder_tridiag(
+ Matrix< D, OrthogonalType, Dense > &Q,
+ Matrix< D, SymmOrHermTridiagonalType, Dense > &T,
+ Matrix< D, SymmOrHermType, Dense > &H,
+ const Ring & ring = Ring(),
+ const Minus & minus = Minus(),
+ const Divide & divide = Divide() ) {
+
+ RC rc = SUCCESS;
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+ const size_t n = nrows( H );
+
+ // Q = identity( n )
+ rc = alp::set( Q, zero );
+ auto Qdiag = alp::get_view< alp::view::diagonal >( Q );
+ rc = rc ? rc : alp::set( Qdiag, one );
+ if( rc != SUCCESS ) {
+ std::cerr << " set( Q, I ) failed\n";
+ return rc;
+ }
+
+ // Out of place specification of the computation
+ Matrix< D, SymmOrHermType, Dense > RR( n );
+
+ rc = set( RR, H );
+ if( rc != SUCCESS ) {
+ std::cerr << " set( RR, H ) failed\n";
+ return rc;
+ }
+#ifdef DEBUG
+ print_matrix( " << RR >> ", RR );
+#endif
+
+ // a temporary for storing the mxm result
+ Matrix< D, OrthogonalType, Dense > Qtmp( n, n );
+
+ for( size_t k = 0; k < n - 2; ++k ) {
+#ifdef DEBUG
+ std::string matname(" << RR(");
+ matname = matname + std::to_string(k);
+ matname = matname + std::string( ") >> ");
+ print_matrix( matname , RR );
+#endif
+
+ const size_t m = n - k - 1;
+
+ // ===== Begin Computing v =====
+ // v = H[ k + 1 : , k ]
+ // alpha = norm( v ) * v[ 0 ] / norm( v[ 0 ] )
+ // v = v - alpha * e1
+ // v = v / norm ( v )
+
+ auto v_view = get_view( RR, k, utils::range( k + 1, n ) );
+ Vector< D, structures::General, Dense > v( n - ( k + 1 ) );
+ rc = set( v, v_view );
+ if( rc != SUCCESS ) {
+ std::cerr << " set( v, view ) failed\n";
+ return rc;
+ }
+
+ Scalar< D > alpha( zero );
+ rc = norm2( alpha, v, ring );
+ if( rc != SUCCESS ) {
+ std::cerr << " norm2( alpha, v, ring ) failed\n";
+ return rc;
+ }
+
+ rc = eWiseLambda(
+					[ &alpha, &ring, &divide, &minus ]( const size_t i, D &val ) {
+ if ( i == 0 ) {
+ Scalar< D > norm_v0( std::abs( val ) );
+ Scalar< D > val_scalar( val );
+ foldl( alpha, val_scalar, ring.getMultiplicativeOperator() );
+ foldl( alpha, norm_v0, divide );
+ foldl( val_scalar, alpha, minus );
+ val = *val_scalar;
+ }
+ },
+ v
+ );
+ if( rc != SUCCESS ) {
+ std::cerr << " eWiseLambda( lambda, v ) failed\n";
+ return rc;
+ }
+
+ Scalar< D > norm_v( zero );
+ rc = norm2( norm_v, v, ring );
+ if( rc != SUCCESS ) {
+ std::cerr << " norm2( norm_v, v, ring ) failed\n";
+ return rc;
+ }
+
+ rc = foldl(v, norm_v, divide );
+#ifdef DEBUG
+ print_vector( " v = ", v );
+#endif
+ // ===== End Computing v =====
+
+ // ===== Calculate reflector Qk =====
+ // Q_k = identity( n )
+ Matrix< D, SymmOrHermType, Dense > Qk( n );
+ rc = alp::set( Qk, zero );
+ auto Qk_diag = alp::get_view< alp::view::diagonal >( Qk );
+ rc = rc ? rc : alp::set( Qk_diag, one );
+
+				// this part can be rewritten without temp matrix using functors
+ Matrix< D, SymmOrHermType, Dense > vvt( m );
+
+ rc = rc ? rc : set( vvt, outer( v, ring.getMultiplicativeOperator() ) );
+ // vvt = 2 * vvt
+ rc = rc ? rc : foldr( Scalar< D >( 2 ), vvt, ring.getMultiplicativeOperator() );
+
+
+#ifdef DEBUG
+ print_matrix( " vvt ", vvt );
+#endif
+
+ // Qk = Qk - vvt ( expanded: I - 2 * vvt )
+ auto Qk_view = get_view< SymmOrHermType >( Qk, utils::range( k + 1, n ), utils::range( k + 1, n ) );
+ if ( grb::utils::is_complex< D >::value ) {
+ rc = rc ? rc : foldl( Qk_view, alp::get_view< alp::view::transpose >( vvt ), minus );
+ } else {
+ rc = rc ? rc : foldl( Qk_view, vvt, minus );
+ }
+
+#ifdef DEBUG
+ print_matrix( " << Qk >> ", Qk );
+#endif
+ // ===== End of Calculate reflector Qk ====
+
+ // ===== Update R =====
+ // Rk = Qk * Rk * Qk
+
+ // RRQk = RR * Qk
+ Matrix< D, structures::Square, Dense > RRQk( n );
+ rc = rc ? rc : set( RRQk, zero );
+ rc = rc ? rc : mxm( RRQk, RR, Qk, ring );
+ if( rc != SUCCESS ) {
+ std::cerr << " mxm( RRQk, RR, Qk, ring ); failed\n";
+ return rc;
+ }
+#ifdef DEBUG
+ print_matrix( " << RR x Qk = >> ", RRQk );
+#endif
+ // RR = Qk * RRQk
+ rc = rc ? rc : set( RR, zero );
+ rc = rc ? rc : mxm( RR, Qk, RRQk, ring );
+
+#ifdef DEBUG
+ print_matrix( " << RR( updated ) >> ", RR );
+#endif
+ // ===== End of Update R =====
+
+ // ===== Update Q =====
+ // Q = Q * Qk
+
+ // Qtmp = Q * Qk
+ rc = rc ? rc : set( Qtmp, zero );
+ rc = rc ? rc : mxm( Qtmp, Q, Qk, ring );
+
+ // Q = Qtmp
+ rc = rc ? rc : set( Q, Qtmp );
+#ifdef DEBUG
+ print_matrix( " << Q updated >> ", Q );
+#endif
+ // ===== End of Update Q =====
+ }
+
+ // T = RR
+
+ rc = rc ? rc : set( T, get_view< SymmOrHermTridiagonalType > ( RR ) );
+ return rc;
+ }
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/algorithms/qr_eigensolver.hpp b/include/alp/algorithms/qr_eigensolver.hpp
new file mode 100644
index 000000000..d888e6a5f
--- /dev/null
+++ b/include/alp/algorithms/qr_eigensolver.hpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <sstream>
+
+#include <alp.hpp>
+#include <graphblas/utils/iscomplex.hpp> // use from grb
+#include <alp/algorithms/householder_qr.hpp>
+#ifdef DEBUG
+#include "../tests/utils/print_alp_containers.hpp"
+#endif
+
+// TEMPDISABLE should be removed in the final version
+#define TEMPDISABLE
+
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * Calculate eigendecomposition of square matrix T
+ * \f$T = Qdiag(d)Q^T\f$ where
+ * \a T is real
+ * \a Q is orthogonal (columns are eigenvectors).
+ * \a d is vector containing eigenvalues.
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type of minus operator used in the computation
+ * @tparam Divide Type of divide operator used in the computation
+		 * @param[out] Q output orthogonal matrix containing eigenvectors
+		 * @param[out] d output vector containing eigenvalues
+ * @param[in] T input symmetric tridiagonal matrix
+ * @param[in] ring A semiring for operations
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename MatA,
+ typename MatQ,
+ typename Vec,
+ typename D = typename MatA::value_type,
+ class Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >
+ >
+ RC qr_eigensolver(
+ MatA &A,
+ MatQ &Q,
+ Vec &d,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+			const Divide &divide = Divide()
+ ) {
+ (void) ring;
+ (void) minus;
+ (void) divide;
+
+ const size_t max_it = 1.e+7;
+ const D tol = 1.e-6;
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+
+ RC rc = SUCCESS;
+
+ rc = rc ? rc : alp::set( d, zero );
+
+ const size_t n = nrows( A );
+
+ alp::Matrix< D, structures::General > Atmp( n, n );
+ rc = rc ? rc : alp::set( Atmp, zero );
+
+ // auto A_diag = alp::get_view< alp::view::diagonal >( A );
+
+ auto A_tmp_orig_view = alp::get_view< typename MatA::structure >( Atmp );
+
+ auto A_tmp_diag = alp::get_view< alp::view::diagonal >( Atmp );
+
+ auto A_tmp_supsquare = alp::get_view< alp::structures::Square >( Atmp, utils::range( 0, n - 1 ), utils::range( 1, n ) );
+ auto A_tmp_supdiag = alp::get_view< alp::view::diagonal >( A_tmp_supsquare );
+
+ auto A_tmp_subsquare = alp::get_view< alp::structures::Square >( Atmp, utils::range( 1, n ), utils::range( 0, n - 1 ) );
+ auto A_tmp_subdiag = alp::get_view< alp::view::diagonal >( A_tmp_subsquare );
+
+ rc = rc ? rc : alp::set( A_tmp_orig_view, A );
+ rc = rc ? rc : alp::set( A_tmp_subdiag, A_tmp_supdiag );
+
+// //#ifdef DEBUG
+// print_matrix( " A(input) = ", A );
+// print_matrix( " Atmp = ", Atmp );
+// //#endif
+
+ rc = rc ? rc : alp::set( Q, zero );
+ auto Q_diag = alp::get_view< alp::view::diagonal >( Q );
+ rc = rc ? rc : alp::set(
+ Q_diag,
+ one
+ );
+
+ alp::Matrix< D, structures::Orthogonal > qmat( n );
+ alp::Matrix< D, structures::General > rmat( n, n );
+ MatQ Q_tmp( n, n );
+
+ size_t k1 = 0;
+ size_t k2 = n;
+
+ for( size_t i = 0; i < max_it; ++i ) {
+// //#ifdef DEBUG
+// print_vector( " A_tmp_supdiag ", A_tmp_supdiag );
+// //#endif
+
+ Scalar< D > sdiagnorm1( zero );
+ auto sdiag1 = alp::get_view( A_tmp_supdiag, utils::range( k1, k1 + 1 ) );
+ rc = rc ? rc : alp::norm2( sdiagnorm1, sdiag1, ring );
+ if( std::abs( *sdiagnorm1 ) < tol ) {
+ ++k1;
+ }
+ if ( k1 >= k2 - 1 ){
+ break;
+ }
+
+ Scalar< D > sdiagnorm2( zero );
+ auto sdiag2 = alp::get_view( A_tmp_supdiag, utils::range( k2 - 2, k2 - 1 ) );
+ rc = rc ? rc : alp::norm2( sdiagnorm2, sdiag2, ring );
+ if( std::abs( *sdiagnorm2 ) < tol ) {
+ --k2;
+ }
+ if ( k1 >= k2 - 1 ){
+ break;
+ }
+
+ if( ( k2 - k1 ) != n ) {
+ auto A_tmp_subprob = alp::get_view( Atmp, utils::range( k1, k2 ), utils::range( k1, k2 ) );
+ MatQ qmat2( k2 - k1 );
+ MatA A_sub_mat( k2 - k1 );
+ Vec d_tmp( k2 - k1 );
+ rc = rc ? rc : alp::set( A_sub_mat, zero );
+ auto view_t1 = alp::get_view< typename MatA::structure >( A_tmp_subprob );
+ rc = rc ? rc : alp::set( A_sub_mat, view_t1 );
+ rc = rc ? rc : alp::set( d_tmp, zero );
+// //#ifdef DEBUG
+// print_matrix( " Atmp ", Atmp );
+// print_matrix( " A_tmp_subprob ", A_tmp_subprob );
+// print_matrix( " A_sub_mat ", A_sub_mat );
+// //#endif
+ rc = rc ? rc : alp::set( qmat2, zero );
+ rc = rc ? rc : alp::algorithms::qr_eigensolver( A_sub_mat, qmat2, d_tmp );
+// // #ifdef DEBUG
+// std::cout << " d_tmp : \n";
+// print_vector( " ---> d_tmp ", d_tmp );
+// // #endif
+
+ //Q[:,k1:k2]=Q[:,k1:k2].dot(q1)
+ auto Q_update_view = alp::get_view< structures::OrthogonalColumns >( Q, utils::range( 0, n ), utils::range( k1, k2 ) );
+ alp::Matrix< D, structures::OrthogonalColumns > Q_tmp2( n, k2 - k1 );
+ rc = rc ? rc : alp::set( Q_tmp2, Q_update_view );
+ rc = rc ? rc : alp::set( Q_update_view, zero );
+ rc = rc ? rc : alp::mxm( Q_update_view, Q_tmp2, qmat2, ring );
+
+ rc = rc ? rc : alp::set( A_tmp_subprob, zero );
+ auto A_tmp_diag_update = alp::get_view< alp::view::diagonal >( A_tmp_subprob );
+ rc = rc ? rc : alp::set( A_tmp_diag_update, d_tmp );
+
+ break;
+ } else {
+
+ rc = rc ? rc : alp::set( qmat, zero );
+ rc = rc ? rc : alp::set( rmat, zero );
+ rc = rc ? rc : alp::algorithms::householder_qr( Atmp, qmat, rmat, ring );
+
+ rc = rc ? rc : alp::set( Q_tmp, Q );
+ rc = rc ? rc : alp::set( Q, zero );
+ rc = rc ? rc : alp::mxm( Q, Q_tmp, qmat, ring );
+ rc = rc ? rc : alp::set( Atmp, zero );
+ rc = rc ? rc : alp::mxm( Atmp, rmat, qmat, ring );
+
+ }
+
+// //#ifdef DEBUG
+// print_matrix( " Atmp ", Atmp );
+// //#endif
+
+// if( i % ( n ) == 0 ) {
+ Scalar< D > supdiagnorm( zero );
+ rc = rc ? rc : alp::norm2( supdiagnorm, A_tmp_supdiag, ring );
+ if( std::abs( *supdiagnorm ) < tol * tol ) {
+ break;
+ }
+// }
+ }
+
+ rc = rc ? rc : alp::set( d, A_tmp_diag );
+
+ return rc;
+ }
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/algorithms/svd.hpp b/include/alp/algorithms/svd.hpp
new file mode 100644
index 000000000..d008cd221
--- /dev/null
+++ b/include/alp/algorithms/svd.hpp
@@ -0,0 +1,478 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <sstream>
+
+#include <alp.hpp>
+#include <graphblas/utils/iscomplex.hpp> // use from grb
+#include <alp/algorithms/householder_bidiag.hpp>
+#ifdef DEBUG
+#include "../tests/utils/print_alp_containers.hpp"
+#endif
+
+namespace alp {
+
+ namespace algorithms {
+
+
+ /**
+ * Calculate Givens rotation 2x2 matrix elements and overwrite
+ * the content of matrix G. Givens rotation elements G=[[c,-s*],[s,c]] are determined by
+ * input vector v=[a,b], so that G v = [r,0]
+ */
+ // for a more general purpose
+ // a more stable implementations is needed
+ // todo: move to utils?
+ template<
+ typename MatG,
+ typename VecV,
+ typename D = typename MatG::value_type,
+ class Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_matrix< MatG >::value &&
+ is_vector< VecV >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value
+ > * = nullptr
+ >
+ RC Givens(
+ MatG &G,
+ VecV &v,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+			const Divide &divide = Divide()
+ ) {
+ (void) minus;
+ RC rc = SUCCESS;
+
+#ifdef DEBUG
+ if( ( nrows( G ) != 2 ) ||
+ ( ncols( G ) != 2 ) ||
+ ( size( v ) != 2 )
+ ) {
+ std::cerr << "Wrong size in Givens.";
+ return FAILED;
+ }
+#endif
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+
+ // c = abs(a) / sqrt(abs(a)**2 + abs(b)**2)
+ // s = (a/abs(a)) * conjugate(b) / sqrt(abs(a)**2 + abs(b)**2)
+ // return(array([[c,-conjugate(s)],[(s),c]]))
+ Scalar< D > c( zero );
+ Scalar< D > s( zero );
+ Scalar< D > d( zero );
+ rc = rc ? rc : alp::norm2( d, v, ring );
+ auto a = get_view( v, utils::range( 0, 1 ) );
+ auto b = get_view( v, utils::range( 1, 2 ) );
+
+ rc = rc ? rc : alp::norm2( c, a, ring );
+ rc = rc ? rc : alp::foldl( s, a, ring.getAdditiveMonoid() );
+
+ rc = rc ? rc : alp::foldl( s, c, divide );
+ rc = rc ? rc : alp::foldl( s, conjugate( b ), ring.getMultiplicativeMonoid() );
+
+ // return(array([[c,-conjugate(s)],[(s),c]]),r)
+ auto G11 = get_view( G, 0, utils::range( 0, 1 ) );
+ auto G12 = get_view( G, 0, utils::range( 1, 2 ) );
+ auto G21 = get_view( G, 1, utils::range( 0, 1 ) );
+ auto G22 = get_view( G, 1, utils::range( 1, 2 ) );
+ rc = rc ? rc : alp::set( G, zero );
+ rc = rc ? rc : alp::foldl( G11, c, ring.getAdditiveOperator() );
+ rc = rc ? rc : alp::foldl( G22, c, ring.getAdditiveOperator() );
+ rc = rc ? rc : alp::foldl( G21, s, ring.getAdditiveOperator() );
+ rc = rc ? rc : alp::set( G12, conjugate( G21 ) );
+ rc = rc ? rc : alp::foldl( G12, Scalar< D >( -1 ), ring.getMultiplicativeOperator() );
+ rc = rc ? rc : alp::foldl( G, d, divide );
+ return rc;
+ }
+
+
+ /** Golub-Kahan SVD step */
+ template<
+ typename MatB,
+ typename MatU,
+ typename MatV,
+ typename D = typename MatB::value_type,
+ class Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_matrix< MatB >::value &&
+ is_matrix< MatU >::value &&
+ is_matrix< MatV >::value &&
+ structures::is_a< typename MatB::structure, structures::General >::value &&
+ structures::is_a< typename MatU::structure, structures::OrthogonalColumns >::value &&
+ structures::is_a< typename MatV::structure, structures::OrthogonalRows >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value
+ > * = nullptr
+ >
+ RC gk_svd_step(
+ MatU &U,
+ MatB &B,
+ MatV &V,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+			const Divide &divide = Divide()
+ ) {
+ RC rc = SUCCESS;
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+
+ const size_t m = nrows( B );
+ const size_t n = ncols( B );
+ const size_t k = std::min( m, n );
+
+ // get lambda
+ // calculate eigenvalue llambda of
+ // which is closer to t22
+ auto BEnd = get_view( B, utils::range( k - 3, k ), utils::range( k - 2, k ) );
+ Matrix< D, structures::Square, Dense > BEndSquare( 2, 2 );
+ rc = rc ? rc : alp::set( BEndSquare, zero );
+ auto BEndT = get_view< alp::view::transpose >( BEnd );
+ auto BEndT_star = conjugate( BEndT );
+ rc = rc ? rc : mxm( BEndSquare, BEndT_star, BEnd, ring );
+
+ auto tdiag = get_view< alp::view::diagonal >( BEndSquare );
+ auto t11 = get_view( BEndSquare, 0, utils::range( 0, 1 ) );
+ auto t12 = get_view( BEndSquare, 0, utils::range( 1, 2 ) );
+ auto t22 = get_view( BEndSquare, 1, utils::range( 1, 2 ) );
+
+ Scalar< D > llambda( zero );
+ rc = rc ? rc : alp::foldl( llambda, tdiag, ring.getAdditiveMonoid() );
+ rc = rc ? rc : alp::foldl( llambda, alp::Scalar< D >( 2 ), divide );
+
+ Scalar< D > bb( zero );
+ rc = rc ? rc : alp::foldl( bb, t11, ring.getAdditiveMonoid() );
+ rc = rc ? rc : alp::foldl( bb, Scalar< D >( -1 ), ring.getMultiplicativeOperator() );
+ rc = rc ? rc : alp::foldl( bb, t22, ring.getAdditiveMonoid() );
+ rc = rc ? rc : alp::foldl( bb, alp::Scalar< D >( 2 ), divide );
+
+ Scalar< D > cc( zero );
+ rc = rc ? rc : alp::foldl( cc, conjugate( t12 ), ring.getAdditiveMonoid() );
+
+ Vector< D > DD( 2 );
+ rc = rc ? rc : alp::set( DD, zero );
+ auto DD0 = get_view( DD, utils::range( 0, 1 ) );
+ auto DD1 = get_view( DD, utils::range( 1, 2 ) );
+ rc = rc ? rc : alp::foldl( DD0, bb, ring.getAdditiveOperator() );
+ rc = rc ? rc : alp::foldl( DD1, cc, ring.getAdditiveOperator() );
+ rc = rc ? rc : alp::set( bb, zero );
+ rc = rc ? rc : alp::norm2( bb, DD, ring );
+
+ Scalar< D > t11scal( zero );
+ Scalar< D > t22scal( zero );
+ rc = rc ? rc : alp::foldl( t11scal, t11, ring.getAdditiveMonoid() );
+ rc = rc ? rc : alp::foldl( t22scal, t22, ring.getAdditiveMonoid() );
+
+ if ( std::real( *t11scal ) > std::real( *t22scal ) ) {
+ rc = rc ? rc : alp::foldl( llambda, bb, minus );
+ } else {
+ rc = rc ? rc : alp::foldl( llambda, bb, ring.getAdditiveOperator() );
+ }
+ // end of get lambda
+
+ Vector< D > rotvec( 2 );
+ auto Brow = get_view( B, 0, utils::range( 0, 2 ) );
+ auto B00 = get_view( B, 0, utils::range( 0, 1 ) );
+ Scalar< D > b00star( zero );
+ rc = rc ? rc : alp::foldl( b00star, conjugate( B00 ), ring.getAdditiveMonoid() );
+ rc = rc ? rc : alp::set( rotvec, Brow );
+ rc = rc ? rc : alp::foldl( rotvec, b00star, ring.getMultiplicativeOperator() );
+
+ auto rotvec0 = get_view( rotvec, utils::range( 0, 1 ) );
+ rc = rc ? rc : alp::foldl( rotvec0, llambda, minus );
+
+ Matrix< D, structures::Square, Dense > G( 2, 2 );
+ rc = rc ? rc : alp::set( G, zero );
+ rc = rc ? rc : Givens( G, rotvec );
+ auto Gdiag = get_view< alp::view::diagonal >( G );
+ auto Gstar = conjugate( G );
+ auto GT = get_view< alp::view::transpose >( G );
+ auto GTstar = conjugate( GT );
+
+ for( size_t i = 0; i < k - 1; ++i ){
+ // B[max(i-1,0):i+2,i:i+2]=B[max(i-1,0):i+2,i:i+2].dot(G)
+ auto Bblock1 = get_view( B, utils::range( ( i == 0 ? 0 : i - 1 ), i + 2 ), utils::range( i, i + 2 ) );
+ Matrix< D, structures::General, Dense > TMP1( nrows( Bblock1 ), ncols( Bblock1 ) );
+ rc = rc ? rc : alp::set( TMP1, Bblock1 );
+ rc = rc ? rc : alp::set( Bblock1, zero );
+ rc = rc ? rc : mxm( Bblock1, TMP1, G, ring );
+
+ // update V
+ // G2=G-identity(2).astype(complex)
+ rc = rc ? rc : alp::foldl( Gdiag, one, minus );
+ // V[i:i+2,:]=V[i:i+2,:] + conjugate(G2).dot(V[i:i+2,:])
+ auto Vstrip = get_view< structures::OrthogonalRows >( V, utils::range( i, i + 2 ), utils::range( 0, ncols( V ) ) );
+ Matrix< D, structures::OrthogonalRows, Dense > TMPStrip1( nrows( Vstrip ), ncols( Vstrip ) );
+ rc = rc ? rc : alp::set( TMPStrip1, Vstrip );
+ rc = rc ? rc : mxm( Vstrip, GTstar, TMPStrip1, ring );
+
+ // B[i:i+2,i:i+3]=G.T.dot(B[i:i+2,i:i+3])
+ auto Bblock2 = get_view( B, utils::range( i, i + 2 ), utils::range( i, std::min( i + 3, n ) ) );
+ Matrix< D, structures::General, Dense > TMP2( nrows( Bblock2 ), ncols( Bblock2 ) );
+ auto rotvec2 = get_view( B, utils::range( i, i + 2 ), i );
+ rc = rc ? rc : Givens( G, rotvec2 );
+ rc = rc ? rc : alp::set( TMP2, Bblock2 );
+ rc = rc ? rc : alp::set( Bblock2, zero );
+ rc = rc ? rc : mxm( Bblock2, GT, TMP2, ring );
+
+ // update U
+ // G2=G-identity(2).astype(complex)
+ rc = rc ? rc : alp::foldl( Gdiag, one, minus );
+ // U[:,k:k+2]=U[:,k:k+2]+U[:,k:k+2].dot(conjugate(G2))
+ auto Ustrip = get_view< structures::OrthogonalColumns >( U, utils::range( 0, nrows( U ) ), utils::range( i, i + 2 ) );
+ Matrix< D, structures::OrthogonalColumns, Dense > TMPStrip2( nrows( Ustrip ), ncols( Ustrip ) );
+ rc = rc ? rc : alp::set( TMPStrip2, Ustrip );
+ rc = rc ? rc : mxm( Ustrip, TMPStrip2, Gstar, ring );
+
+ if( i + 2 < k ) {
+ auto rotvec3 = get_view( B, i, utils::range( i + 1, i + 3 ) );
+ rc = rc ? rc : Givens( G, rotvec3 );
+ } else {
+ rc = rc ? rc : Givens( G, rotvec2 );
+ }
+ }
+
+ return rc;
+ }
+
+		/** Golub-Kahan SVD algorithm */
+ template<
+ typename MatB,
+ typename MatU,
+ typename MatV,
+ typename D = typename MatB::value_type,
+ class Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_matrix< MatB >::value &&
+ is_matrix< MatU >::value &&
+ is_matrix< MatV >::value &&
+ structures::is_a< typename MatB::structure, structures::General >::value &&
+ structures::is_a< typename MatU::structure, structures::Orthogonal >::value &&
+ structures::is_a< typename MatV::structure, structures::Orthogonal >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value
+ > * = nullptr
+ >
+ RC svd_solve(
+ MatU &U,
+ MatB &B,
+ MatV &V,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+			const Divide &divide = Divide()
+ ) {
+ RC rc = SUCCESS;
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+
+ const size_t m = nrows( B );
+ const size_t n = ncols( B );
+ const size_t k = std::min( m, n );
+
+ const double tol = 1.e-12;
+ const size_t maxit = k * 5;
+
+ auto Bsupsquare = get_view( B, utils::range( 0, k - 1 ) , utils::range( 1, k ) );
+ auto superdiagonal = get_view< alp::view::diagonal >( Bsupsquare );
+
+ size_t i1 = 0;
+ size_t i2 = k;
+
+ rc = rc ? rc : algorithms::householder_bidiag( U, B, V, ring, minus, divide );
+
+ // eliminate superdiagonal elements via Givens rotations
+ for( size_t i = 0; i < maxit; ++i ) {
+ // todo: In convergence test: replace absolute with relative tolerance check
+				// todo: check for zeroes in diagonal, if any do Givens rotations
+				//       to move the zero from diagonal to superdiagonal
+				//       (not likely to affect randomly generated tests)
+
+ // check for zeros in superdiagonal, if any,
+ // move i1 and i2 to bound non-zero part of superdiagonal
+ for( ; i1 < i2; ++i1 ) {
+ auto B_l = get_view( superdiagonal, utils::range( i1, i1 + 1 ) );
+ Scalar< D > bnorm( zero );
+ rc = rc ? rc : alp::norm2( bnorm, B_l, ring );
+ if( std::abs( *bnorm ) > tol ) {
+ break;
+ }
+ }
+ for( ; i2 > i1; --i2 ) {
+ auto B_l = get_view( superdiagonal, utils::range( i2 - 2, i2 - 1 ) );
+ Scalar< D > bnorm( zero );
+ rc = rc ? rc : alp::norm2( bnorm, B_l, ring );
+ if( std::abs( *bnorm ) > tol ) {
+ break;
+ }
+ }
+ if( i2 <= i1 ){
+ break;
+ }
+
+ auto Bview = get_view( B, utils::range( i1, i2 ), utils::range( i1, i2 ) );
+ auto Uview = get_view< structures::OrthogonalColumns >( U, utils::range( 0, nrows( U ) ), utils::range( i1, i2 ) );
+ auto Vview = get_view< structures::OrthogonalRows >( V, utils::range( i1, i2 ), utils::range( 0, ncols( V ) ) );
+
+ rc = rc ? rc : algorithms::gk_svd_step( Uview, Bview, Vview, ring, minus, divide );
+
+ // check convergence
+ Scalar< D > sup_diag_norm( zero );
+ rc = rc ? rc : alp::norm2( sup_diag_norm, superdiagonal, ring );
+
+ if( std::abs( *sup_diag_norm ) < tol ) {
+ break ;
+ }
+ }
+
+ // Rotate diagonal elements in complex plane
+ // in order to have them on real axis (positive singular values)
+ auto BSquare = alp::get_view( B, utils::range( 0, k ), utils::range( 0, k ) );
+ auto DiagBview = alp::get_view< alp::view::diagonal >( BSquare );
+ for( size_t i = 0; i < size( DiagBview ); ++i ) {
+ Scalar< D > sigmaiphase( zero );
+ Scalar< D > sigmainorm( zero );
+ auto U_vi = get_view( U, utils::range( 0, nrows( U ) ), i );
+ auto B_vi = get_view( B, i, utils::range( 0, ncols( B ) ) );
+ auto d_i = get_view( DiagBview, utils::range( i, i + 1 ) );
+ rc = rc ? rc : alp::norm2( sigmainorm, d_i, ring );
+ if( std::abs( *sigmainorm ) > tol ) {
+ rc = rc ? rc : alp::foldl( sigmaiphase, d_i, ring.getAdditiveMonoid() );
+ rc = rc ? rc : alp::foldl( sigmaiphase, sigmainorm, divide );
+ rc = rc ? rc : alp::foldl( U_vi, sigmaiphase, ring.getMultiplicativeOperator() );
+ rc = rc ? rc : alp::foldl( B_vi, sigmaiphase, divide );
+ }
+ }
+
+ return rc;
+ }
+
+
+
+ /**
+ * Computes singular value decomposition (inplace) of a
+ * general matrix \f$H(input) = U B V \f$
+ * where \a H is general (complex or real),
+ * \a U orthogonal and \a V are orthogonal, \a B is nonzero only on diagonal
+ * and it contains positive singular values.
+		 * If convergence is not reached B will contain nonzeros on superdiagonal.
+ *
+ * @tparam MatH type of general matrix on which we perform SVD
+ * @tparam MatU type of orthogonal matrix U, output of SVD
+		 * @tparam MatS type of rectangular diagonal matrix S,
+ * output which has positive nonzero elements on diagonal
+ * @tparam MatV type of orthogonal matrix V, output of SVD
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type minus operator used in the computation
+ * @tparam Divide Type of divide operator used in the computation
+ * @param[out] U orthogonal matrix
+ * @param[out] V orthogonal matrix
+ * @param[in,out] B input general matrix, output bidiagonal matrix
+ * @param[in] ring A semiring for operations
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename MatH,
+ typename MatU,
+ typename MatS,
+ typename MatV,
+ typename D = typename MatH::value_type,
+ class Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_matrix< MatH >::value &&
+ is_matrix< MatU >::value &&
+ is_matrix< MatS >::value &&
+ is_matrix< MatV >::value &&
+ structures::is_a< typename MatH::structure, structures::General >::value &&
+ structures::is_a< typename MatU::structure, structures::Orthogonal >::value &&
+ structures::is_a< typename MatS::structure, structures::RectangularDiagonal >::value &&
+ structures::is_a< typename MatV::structure, structures::Orthogonal >::value &&
+ is_semiring< Ring >::value &&
+ is_operator< Minus >::value &&
+ is_operator< Divide >::value
+ > * = nullptr
+ >
+ RC svd(
+ const MatH &H,
+ MatU &U,
+ MatS &S,
+ MatV &V,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+			const Divide &divide = Divide()
+ ) {
+ RC rc = SUCCESS;
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+
+ const size_t m = nrows( H );
+ const size_t n = ncols( H );
+
+ // inplace work on B
+ MatH B( m, n );
+ rc = rc ? rc : set( B, H );
+
+ rc = rc ? rc : set( U, zero );
+ rc = rc ? rc : set( V, zero );
+
+ // set U to Identity
+ auto DiagU = alp::get_view< alp::view::diagonal >( U );
+ rc = rc ? rc : alp::set( U, zero );
+ rc = rc ? rc : alp::set( DiagU, one );
+ // set V to Identity
+ auto DiagV = alp::get_view< alp::view::diagonal >( V );
+ rc = rc ? rc : alp::set( V, zero );
+ rc = rc ? rc : alp::set( DiagV, one );
+
+ if( n > m ) {
+ auto UT = get_view< alp::view::transpose >( U );
+ auto BT = get_view< alp::view::transpose >( B );
+ auto VT = get_view< alp::view::transpose >( V );
+ rc = rc ? rc : algorithms::svd_solve( VT, BT, UT, ring, minus, divide );
+ } else {
+ rc = rc ? rc : algorithms::svd_solve( U, B, V, ring, minus, divide );
+ }
+
+ // update S
+ auto DiagS = alp::get_view< alp::view::diagonal >( S );
+ auto DiagB = alp::get_view< alp::view::diagonal >( B );
+ rc = rc ? rc : set( S, zero );
+ rc = rc ? rc : set( DiagS, DiagB );
+
+ return rc;
+ }
+
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/algorithms/symherm_posdef_inverse.hpp b/include/alp/algorithms/symherm_posdef_inverse.hpp
new file mode 100644
index 000000000..e9ba929df
--- /dev/null
+++ b/include/alp/algorithms/symherm_posdef_inverse.hpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * Computes the inverse of a real symmetric positive definite (SPD)
+ * (or complex Hermitian positive definite) matrix H via its Cholesky
+ * factorization U^HU = H, where \a U is upper triangular, and ^H is
+ * transpose in the real case and transpose + complex conjugate otherwise.
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @param[out] Hinv output inverse of the input matrix H
+ * @param[in] H input real symmetric positive definite matrix
+ * or complex hermitian positive definite matrix
+ * @param[in] ring The semiring used in the computation
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename MatH,
+ typename D = typename MatH::value_type,
+ typename Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ std::enable_if_t<
+ is_matrix< MatH >::value &&
+ (
+ (
+ !grb::utils::is_complex< D >::value &&
+ structures::is_a< typename MatH::structure, structures::SymmetricPositiveDefinite >::value
+ ) || (
+ grb::utils::is_complex< D >::value &&
+ structures::is_a< typename MatH::structure, structures::HermitianPositiveDefinite >::value
+ )
+ ) &&
+ is_semiring< Ring >::value
+ > * = nullptr
+ >
+ RC symherm_posdef_inverse(
+ MatH &Hinv,
+ const MatH &H,
+ const Ring &ring = Ring()
+ ) {
+ RC rc = SUCCESS;
+
+ const alp::Scalar< D > zero( ring.template getZero< D >() );
+ const alp::Scalar< D > one( ring.template getOne< D >() );
+
+ if( nrows( Hinv ) != nrows( H ) ) {
+ std::cerr << "Incompatible sizes in symherm_posdef_inverse.\n";
+ return FAILED;
+ }
+
+ const size_t N = nrows( H );
+
+ alp::Matrix< D, structures::UpperTriangular, Dense > U( N );
+
+ rc = rc ? rc : alp::set( U, zero );
+
+ rc = rc ? rc : algorithms::cholesky_uptr( U, H, ring );
+#ifdef DEBUG
+ print_matrix( " U ", U );
+#endif
+ // H = U^H U
+ // H^-1 = U^-1 U^H-1
+ alp::Matrix< D, structures::UpperTriangular, Dense > Uinv( N );
+ rc = rc ? rc : alp::set( Uinv, zero );
+ auto Uinvdiag = alp::get_view< alp::view::diagonal >( Uinv );
+ auto UinvT = alp::get_view< alp::view::transpose >( Uinv );
+ rc = rc ? rc : alp::set( Uinvdiag, one );
+ auto UT = alp::get_view< alp::view::transpose >( U );
+ for( size_t i = 0; i < N; ++i ){
+ auto x = alp::get_view( UinvT, utils::range( i, N ), i );
+ auto UT_submatview = alp::get_view( UT, utils::range( i, N ), utils::range( i, N ) );
+ rc = rc ? rc : alp::algorithms::forwardsubstitution( UT_submatview, x, ring );
+ }
+#ifdef DEBUG
+ print_matrix( " Uinv ", Uinv );
+#endif
+ rc = rc ? rc : alp::set( Hinv, zero );
+ // conjugate(linv.T).dot(linv)
+ auto UinvTvstar = conjugate( UinvT );
+ rc = rc ? rc : alp::mxm( Hinv, Uinv, UinvTvstar, ring );
+#ifdef DEBUG
+ print_matrix( " Hinv ", Hinv );
+#endif
+ return rc;
+ }
+
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/algorithms/symm_tridiag_eigensolver.hpp b/include/alp/algorithms/symm_tridiag_eigensolver.hpp
new file mode 100644
index 000000000..e9e29d2ea
--- /dev/null
+++ b/include/alp/algorithms/symm_tridiag_eigensolver.hpp
@@ -0,0 +1,539 @@
+/*
+ * Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+#include // use from grb
+#ifdef DEBUG
+#include "../tests/utils/print_alp_containers.hpp"
+#endif
+
+// TEMPDISABLE should be removed in the final version
+#define TEMPDISABLE
+
+
+namespace alp {
+
+ namespace algorithms {
+
+ /**
+ * find zero of secular equation in interval
+ * using bisection
+ * this is not an optimal algorithm and there are many
+ * more efficient implementations
+ */
+ template<
+ typename D,
+ typename VectorD,
+ typename VectorV,
+ class Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_vector< VectorD >::value &&
+ is_vector< VectorV >::value
+ > * = nullptr
+ >
+ RC bisec_sec_eq(
+ Scalar< D > &lambda,
+ const VectorD &d,
+ // Vector v should be const, but that would disable eWiseLambda,
+ // to be resolved in the future
+ VectorV &v,
+ const Scalar< D > &a,
+ const Scalar< D > &b,
+ const D tol = 1.e-10,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+ const Divide &divide = Divide()
+ ) {
+ RC rc = SUCCESS;
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+
+ Scalar< D > x0( a );
+ rc = rc ? rc : alp::foldl( x0, b, ring.getAdditiveOperator() );
+ rc = rc ? rc : alp::foldl( x0, Scalar< D >( 2 ), divide );
+
+ Scalar< D > delta( a );
+ rc = rc ? rc : alp::foldl( delta, b, minus );
+ *delta = std::abs( *delta );
+
+ if( *delta < tol ) {
+ alp::set( lambda, x0 );
+ return rc;
+ }
+
+ //fx0=1+sum(v**2/(d-x0))
+ Scalar< D > fx0( one );
+ rc = rc ? rc : eWiseLambda(
+ [ &d, &x0, &fx0, &ring, &minus, &divide ]( const size_t i, D &val ) {
+ Scalar< D > alpha( val );
+ Scalar< D > beta( d[ i ] );
+ alp::foldl( alpha, Scalar< D > ( val ), ring.getMultiplicativeOperator() );
+ alp::foldl( beta, x0, minus );
+ alp::foldl( alpha, beta, divide );
+ alp::foldl( fx0, alpha, ring.getAdditiveOperator() );
+ },
+ v
+ );
+
+ if( std::abs( *fx0 ) < tol ) {
+ alp::set( lambda, x0 );
+ return rc;
+ }
+
+ if( *fx0 < *zero ) {
+ rc = rc ? rc : bisec_sec_eq( lambda, d, v, x0, b, tol );
+ } else {
+ rc = rc ? rc : bisec_sec_eq( lambda, d, v, a, x0, tol );
+ }
+
+ return rc;
+ }
+
+
+ /**
+ * Calculate eigendecomposition of system D + vvt
+ * \f$D = diag(d)\f$ is a diagonal matrix and
+ * \a vvt outer product outer(v,v)
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type of minus operator used in the computation
+ * @tparam Divide Type of divide operator used in the computation
+ * @param[out] Egvecs output orthogonal matrix containing eigenvectors
+ * @param[out] egvals output vector containing eigenvalues
+ * @param[in] ring A semiring for operations
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename VectorEgVals,
+ typename VectorD,
+ typename VectorV,
+ typename OrthogonalMat,
+ typename D = typename OrthogonalMat::value_type,
+ class Ring = Semiring<
+ operators::add< D >,
+ operators::mul< D >,
+ identities::zero,
+ identities::one
+ >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >,
+ std::enable_if_t<
+ is_vector< VectorD >::value &&
+ is_vector< VectorV >::value &&
+ is_matrix< OrthogonalMat >::value &&
+ alp::structures::is_a< typename OrthogonalMat::structure, alp::structures::Orthogonal >::value
+ > * = nullptr
+ >
+ RC eigensolveDiagPlusOuter(
+ VectorEgVals &egvals,
+ OrthogonalMat &Egvecs,
+ VectorD &d,
+ VectorV &v,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+ const Divide &divide = Divide()
+ ) {
+ RC rc = SUCCESS;
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+ const size_t n = nrows( Egvecs );
+ const double eps = 1.e-7;
+
+ // all egvec/val are trivial when the corresponding
+ // element of v is zero
+ size_t count_direct_egvc = 0;
+ size_t count_non_direct_egvc = 0;
+
+ std::vector< size_t > direct_egvc_indx( n, 0 );
+ std::vector< size_t > non_direct_egvc_indx( n, 0 );
+ // the following loop should be replaced by ALP primitives
+ // since v is not sorted it seems that another sort is needed
+ * currently there is no simple way to implement this in ALP
+ for( size_t i = 0; i < n; i++ ) {
+ if( std::abs( v[ i ] ) < eps ) {
+ // in these cases eigenvectors are canonical vectors
+ // and eigenvalues are d[i]
+ direct_egvc_indx[ count_direct_egvc ] = i ;
+ ++count_direct_egvc;
+ } else {
+ // these cases require complicated egval formula
+ // and for cases where egval is close to the singular point
+ // different algorithm for eigenvectors needs to be implemented
+ non_direct_egvc_indx[ count_non_direct_egvc ] = i;
+ ++count_non_direct_egvc;
+ }
+ }
+ direct_egvc_indx.resize( count_direct_egvc );
+ non_direct_egvc_indx.resize( count_non_direct_egvc );
+ alp::Vector< size_t > select_direct( count_direct_egvc );
+ alp::Vector< size_t > select_non_direct( count_non_direct_egvc );
+ alp::buildVector( select_direct, direct_egvc_indx.begin(), direct_egvc_indx.end() );
+ alp::buildVector( select_non_direct, non_direct_egvc_indx.begin(), non_direct_egvc_indx.end() );
+
+#ifdef DEBUG
+ std::cout << " ----> count_direct_egvc = " << count_direct_egvc << "\n";
+ std::cout << " ----> count_non_direct_egvc = " << count_non_direct_egvc << "\n";
+#endif
+ auto egvals_direct = alp::get_view< alp::structures::General >( egvals, select_direct );
+ auto egvals_non_direct = alp::get_view< alp::structures::General >( egvals, select_non_direct );
+
+ auto Egvecs_non_direct = alp::get_view< alp::structures::Orthogonal >(
+ Egvecs, select_non_direct, select_non_direct
+ );
+
+ // copy d -> egvals for direct part
+ rc = rc ? rc : alp::set(
+ egvals_direct,
+ get_view< alp::structures::General >( d, select_direct )
+ );
+
+ auto d_view = alp::get_view< alp::structures::General >( d, select_non_direct );
+ auto v_view = alp::get_view< alp::structures::General >( v, select_non_direct );
+
+#ifdef DEBUG
+ print_vector( "eigensolveDiagPlusOuter: d ", d );
+ print_vector( "eigensolveDiagPlusOuter: v ", v );
+ print_vector( "eigensolveDiagPlusOuter: d_view ", d_view );
+ print_vector( "eigensolveDiagPlusOuter: v_view ", v_view );
+#endif
+
+ // vec_b = {d_view[1], d_view[2], ... , d_view[N-1], d_view[N]+dot(v,v) }
+ size_t nn = alp::size( d_view );
+ alp::Vector< D > vec_b( nn );
+ auto v1 = alp::get_view( vec_b, utils::range( 0, nn - 1 ) );
+ auto v2 = alp::get_view( d_view, utils::range( 1, nn ) );
+ rc = rc ? rc : alp::set( v1, v2 );
+ auto v3 = alp::get_view( vec_b, utils::range( nn - 1, nn ) );
+ auto v4 = alp::get_view( d_view, utils::range( nn - 1, nn ) );
+ rc = rc ? rc : alp::set( v3, v4 );
+
+ // eWiseLambda currently does not work with select view
+ // dot() does not work with select view
+ // as a (temp) solution we use temp vectors
+ alp::Vector< D > vec_temp_egvals( nn );
+ alp::Vector< D > vec_temp_d( nn );
+ alp::Vector< D > vec_temp_v( nn );
+
+ rc = rc ? rc : alp::set( vec_temp_egvals, zero );
+ rc = rc ? rc : alp::set( vec_temp_d, d_view );
+ rc = rc ? rc : alp::set( vec_temp_v, v_view );
+
+ Scalar< D > alpha( zero );
+ // there is a bug in dot() when called on select views
+ //rc = rc ? rc : alp::dot( alpha, d_view, d_view, ring );
+ rc = rc ? rc : alp::dot( alpha, vec_temp_v, vec_temp_v, ring );
+
+ auto v5 = alp::get_view( vec_b, utils::range( alp::size( vec_b ) - 1, alp::size( vec_b ) ) );
+ rc = rc ? rc : alp::foldl( v5, alpha, ring.getAdditiveOperator() );
+
+ rc = rc ? rc : alp::eWiseLambda(
+ [ &d_view, &vec_temp_v, &vec_b, &ring, &divide ]( const size_t i, D &val ) {
+ Scalar< D > a( d_view[ i ] );
+ Scalar< D > b( vec_b[ i ] );
+ Scalar< D > w( a );
+ alp::foldl( w, b, ring.getAdditiveOperator() );
+ alp::foldl( w, Scalar< D >( 2 ), divide );
+ bisec_sec_eq( w, d_view, vec_temp_v, a, b );
+ val = *w;
+ },
+ vec_temp_egvals
+ );
+ rc = rc ? rc : alp::set( egvals_non_direct, vec_temp_egvals );
+
+ Matrix< D, structures::General, Dense > tmp_egvecs( nn, nn );
+ Matrix< D, structures::General, Dense > tmp_denominator( nn, nn );
+
+ alp::Vector< D > ones( nn );
+ rc = rc ? rc : alp::set( ones, one );
+ rc = rc ? rc : alp::set(
+ tmp_egvecs,
+ alp::outer( vec_temp_v, ones, ring.getMultiplicativeOperator() )
+ );
+
+ auto ddd = alp::outer( vec_temp_d, ones, ring.getMultiplicativeOperator() );
+ auto lll = alp::outer( ones, egvals_non_direct, ring.getMultiplicativeOperator() );
+ rc = rc ? rc : alp::set( tmp_denominator, ddd );
+ rc = rc ? rc : alp::foldl( tmp_denominator, lll, minus );
+ rc = rc ? rc : alp::foldl( tmp_egvecs, tmp_denominator, divide );
+
+ // while fold matrix -> vector would be a solution to
+ // normalize columns in tmp_egvecs,
+ // here we abuse the syntax and use eWiseLambda.
+ // Once fold matrix -> vector implemented, the next section should be rewritten
+ rc = rc ? rc : alp::eWiseLambda(
+ [ &tmp_egvecs, &nn, &ring, &divide, &zero ]( const size_t i, D &val ) {
+ (void) val;
+ auto egvec_i = get_view( tmp_egvecs, utils::range( 0, nn ), i );
+ Scalar< D > norm_i( zero );
+ alp::norm2( norm_i, egvec_i, ring );
+ alp::foldl( egvec_i, norm_i , divide );
+ },
+ ones
+ );
+
+ // update results
+ auto egvecs_view = alp::get_view( Egvecs_non_direct, utils::range( 0, nn ), utils::range( 0, nn ) );
+ auto tmp_egvecs_orth_view = alp::get_view< typename OrthogonalMat::structure >( tmp_egvecs );
+ rc = rc ? rc : alp::set( egvecs_view, tmp_egvecs_orth_view );
+
+ return rc;
+ }
+
+
+ /**
+ * Calculate eigendecomposition of symmetric tridiagonal matrix T
+ * \f$T = Qdiag(d)Q^T\f$ where
+ * \a T is real symmetric tridiagonal
+ * \a Q is orthogonal (columns are eigenvectors).
+ * \a d is vector containing eigenvalues.
+ *
+ * @tparam D Data element type
+ * @tparam Ring Type of the semiring used in the computation
+ * @tparam Minus Type of minus operator used in the computation
+ * @tparam Divide Type of divide operator used in the computation
+ * @param[out] Q output orthogonal matrix containing eigenvectors
+ * @param[out] d output vector containing eigenvalues
+ * @param[in] T input symmetric tridiagonal matrix
+ * @param[in] ring A semiring for operations
+ * @return RC SUCCESS if the execution was correct
+ *
+ */
+ template<
+ typename D,
+ typename SymmOrHermTridiagonalType,
+ typename OrthogonalType,
+ typename SymmHermTrdiViewType,
+ typename OrthViewType,
+ typename SymmHermTrdiImfR,
+ typename SymmHermTrdiImfC,
+ typename OrthViewImfR,
+ typename OrthViewImfC,
+ typename VecViewType,
+ typename VecImfR,
+ typename VecImfC,
+ class Ring = Semiring< operators::add< D >, operators::mul< D >, identities::zero, identities::one >,
+ class Minus = operators::subtract< D >,
+ class Divide = operators::divide< D >
+ >
+ RC symm_tridiag_dac_eigensolver(
+ Matrix<
+ D,
+ SymmOrHermTridiagonalType,
+ Dense,
+ SymmHermTrdiViewType,
+ SymmHermTrdiImfR,
+ SymmHermTrdiImfC
+ > &T,
+ Matrix<
+ D,
+ OrthogonalType,
+ Dense,
+ OrthViewType,
+ OrthViewImfR,
+ OrthViewImfC
+ > &Q,
+ Vector<
+ D,
+ structures::General,
+ Dense,
+ VecViewType,
+ VecImfR,
+ VecImfC
+ > &d,
+ const Ring &ring = Ring(),
+ const Minus &minus = Minus(),
+ const Divide &divide = Divide()
+ ) {
+ (void) ring;
+ (void) minus;
+ (void) divide;
+
+ const Scalar< D > zero( ring.template getZero< D >() );
+ const Scalar< D > one( ring.template getOne< D >() );
+
+ RC rc = SUCCESS;
+
+ const size_t n = nrows( T );
+ const size_t m = n / 2;
+
+ if( n == 1 ) {
+ //d=T[0];
+ rc = rc ? rc : alp::eWiseLambda(
+ [ &d ]( const size_t i, const size_t j, D &val ) {
+ (void) i;
+ (void) j;
+ alp::set( d, Scalar< D > ( val ) );
+ },
+ T
+ );
+ // Q=[[1]]; a 1x1 matrix
+ rc = rc ? rc : alp::set( Q, one );
+
+ return rc;
+ }
+
+
+ Vector< D, structures::General, Dense > v( n );
+ rc = rc ? rc : alp::set( v, zero );
+
+ auto v1 = alp::get_view( T, utils::range( m - 1, m ), m );
+ auto v2 = alp::get_view( v, utils::range( m , m + 1 ) );
+ rc = rc ? rc : alp::set( v2, v1 );
+
+ auto v3 = alp::get_view( v, utils::range( m - 1 , m ) );
+ rc = rc ? rc : alp::set( v3, one );
+
+#ifdef DEBUG
+ print_vector( " v = ", v );
+#endif
+ Matrix< D, SymmOrHermTridiagonalType, Dense > Atmp( n );
+ rc = rc ? rc : alp::set( Atmp, T );
+ auto vvt = alp::outer( v, ring.getMultiplicativeOperator() ) ;
+
+#ifdef DEBUG
+ print_matrix( " vvt = ", vvt );
+#endif
+ rc = rc ? rc : alp::foldl( Atmp, vvt, minus );
+
+#ifdef DEBUG
+ print_matrix( " Atmp(updated) ", Atmp );
+#endif
+
+ auto Ttop = alp::get_view< SymmOrHermTridiagonalType >( Atmp, utils::range( 0, m ), utils::range( 0, m ) );
+ auto Tdown = alp::get_view< SymmOrHermTridiagonalType >( Atmp, utils::range( m, n ), utils::range( m, n ) );
+
+#ifdef DEBUG
+ print_matrix( " Ttop = ", Ttop );
+ print_matrix( " Tdown = ", Tdown );
+#endif
+
+ Vector< D, structures::General, Dense > dtmp( n );
+ rc = rc ? rc : alp::set( dtmp, zero );
+ auto dtop = alp::get_view( dtmp, utils::range( 0, m ) );
+ auto ddown = alp::get_view( dtmp, utils::range( m, n ) );
+
+ Matrix< D, OrthogonalType, Dense > U( n );
+ rc = rc ? rc : alp::set( U, zero );
+
+ auto Utop = alp::get_view< OrthogonalType >( U, utils::range( 0, m ), utils::range( 0, m ) );
+ auto Udown = alp::get_view< OrthogonalType >( U, utils::range( m, n ), utils::range( m, n ) );
+
+ rc = rc ? rc : symm_tridiag_dac_eigensolver( Ttop, Utop, dtop, ring );
+ rc = rc ? rc : symm_tridiag_dac_eigensolver( Tdown, Udown, ddown, ring );
+ //std::cout << " --> ust one iteration\n";
+
+#ifdef DEBUG
+ std::cout << " after symm_tridiag_dac_eigensolver call:\n";
+ print_matrix( " Utop = ", Utop );
+ print_matrix( " Udown = ", Udown );
+ print_matrix( " U = ", U );
+#endif
+
+ Vector< D, structures::General, Dense > z( n );
+ rc = rc ? rc : alp::set( z, zero );
+
+#ifdef DEBUG
+ print_vector( " v ", v );
+ print_vector( " z ", z );
+#endif
+
+#ifdef TEMPDISABLE
+ // while mxv does not support vectors/view
+ // we cast vector->matrix and use mxm
+ auto z_mat_view = alp::get_view< view::matrix >( z );
+ auto v_mat_view = alp::get_view< view::matrix >( v );
+ rc = rc ? rc : alp::mxm(
+ z_mat_view,
+ alp::get_view< alp::view::transpose >( U ),
+ v_mat_view,
+ ring
+ );
+#else
+ //z=U^T.dot(v)
+ rc = rc ? rc : alp::mxv(
+ z,
+ alp::get_view< alp::view::transpose >( U ),
+ v,
+ ring
+ );
+#endif
+
+#ifdef DEBUG
+ print_vector( " d ", dtmp );
+ print_vector( " z ", z );
+#endif
+
+ // permutation that sorts dtmp
+ alp::Vector< size_t > permutation_vec( n );
+ rc = rc ? rc : alp::sort( permutation_vec, dtmp, alp::relations::lt< D >() );
+
+ alp::Vector< size_t > no_permutation_vec( n );
+ rc = rc ? rc : alp::set< alp::descriptors::use_index >( no_permutation_vec, alp::Scalar< size_t >( 0 ) );
+
+ auto dtmp2 = alp::get_view< alp::structures::General >(
+ dtmp,
+ permutation_vec
+ );
+ auto ztmp2 = alp::get_view< alp::structures::General >(
+ z,
+ permutation_vec
+ );
+#ifdef DEBUG
+ print_vector( " dtmp2 ", dtmp2 );
+ print_vector( " ztmp2 ", ztmp2 );
+#endif
+
+
+ rc = rc ? rc : alp::set( d, zero );
+ Matrix< D, OrthogonalType, Dense > QdOuter( n );
+ rc = rc ? rc : alp::set( QdOuter, zero );
+ auto QdOuter_diag = alp::get_view< alp::view::diagonal >( QdOuter );
+ rc = rc ? rc : alp::set(
+ QdOuter_diag,
+ one
+ );
+
+ auto QdOuter2 = alp::get_view< alp::structures::Orthogonal >(
+ QdOuter, permutation_vec, no_permutation_vec
+ );
+
+ rc = rc ? rc : eigensolveDiagPlusOuter( d, QdOuter2, dtmp2, ztmp2 );
+#ifdef DEBUG
+ print_vector( " d(out) ", d );
+ print_matrix( " QdOuter(out) ", QdOuter );
+ print_matrix( " U ", U );
+#endif
+
+ rc = rc ? rc : alp::set( Q, zero );
+ rc = rc ? rc : alp::mxm( Q, U, QdOuter, ring );
+
+#ifdef DEBUG
+ print_matrix( " Q = U x Q ", Q );
+#endif
+
+ return rc;
+ }
+ } // namespace algorithms
+} // namespace alp
diff --git a/include/alp/amf-based/functorbasedmatrix.hpp b/include/alp/amf-based/functorbasedmatrix.hpp
new file mode 100644
index 000000000..0e56a8688
--- /dev/null
+++ b/include/alp/amf-based/functorbasedmatrix.hpp
@@ -0,0 +1,151 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _H_ALP_AMF_BASED_FUNCTORBASEDMATRIX
+#define _H_ALP_AMF_BASED_FUNCTORBASEDMATRIX
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+
+namespace alp {
+
+ namespace internal {
+
+ /** Forward declaration */
+ template< typename DerivedMatrix >
+ class MatrixBase;
+
+ /** Forward declaration */
+ template< typename T, typename ImfR, typename ImfC, typename DataLambdaType >
+ class FunctorBasedMatrix;
+
+ /** Functor reference getter used by friend functions of specialized Matrix */
+ template< typename T, typename ImfR, typename ImfC, typename DataLambdaType >
+ const typename FunctorBasedMatrix< T, ImfR, ImfC, DataLambdaType >::functor_type &getFunctor( const FunctorBasedMatrix< T, ImfR, ImfC, DataLambdaType > &A );
+
+ /**
+ * Getter for the functor of a functor-based matrix.
+ *
+ * @tparam MatrixType The type of input matrix.
+ *
+ * @param[in] A Input matrix.
+ *
+ * @returns A constant reference to a functor object within the
+ * provided functor-based matrix.
+ */
+ template<
+ typename MatrixType,
+ std::enable_if_t<
+ internal::is_functor_based< MatrixType >::value
+ > * = nullptr
+ >
+ const typename MatrixType::functor_type &getFunctor( const MatrixType &A ) {
+ return static_cast< const typename MatrixType::base_type & >( A ).getFunctor();
+ }
+
+ /**
+ * Specialization of MatrixReference with a lambda function as a target.
+ * Used as a result of low-rank operation to avoid the need for allocating a container.
+ * The data is produced lazily by invoking the lambda function stored as a part of this object.
+ *
+ * \note Views-over-lambda-functions types are used internally as results of low-rank operations and are not
+ * directly exposed to users. From the users perspective, the use of objects of this type does not differ
+ * from the use of other \a alp::Matrix types. The difference lies in a lazy implementation of the access
+ * to matrix elements, which is not exposed to the user.
+ *
+ */
+ template< typename T, typename ImfR, typename ImfC, typename DataLambdaType >
+ class FunctorBasedMatrix : public MatrixBase< FunctorBasedMatrix< T, ImfR, ImfC, DataLambdaType > > {
+ public:
+
+ /** Expose static properties */
+ typedef T value_type;
+ /** Type returned by access function */
+ typedef T access_type;
+ typedef T const_access_type;
+ /** Type of the index used to access the physical storage */
+ typedef std::pair< size_t, size_t > storage_index_type;
+
+ protected:
+
+ typedef FunctorBasedMatrix< T, ImfR, ImfC, DataLambdaType > self_type;
+ friend MatrixBase< self_type >;
+
+ typedef std::function< bool() > initialized_functor_type;
+ const initialized_functor_type initialized_lambda;
+
+ const ImfR imf_r;
+ const ImfC imf_c;
+
+ const DataLambdaType data_lambda;
+
+ std::pair< size_t, size_t > dims() const noexcept {
+ return std::make_pair( imf_r.n, imf_c.n );
+ }
+
+ const DataLambdaType &getFunctor() const noexcept {
+ return data_lambda;
+ }
+
+ bool getInitialized() const noexcept {
+ return initialized_lambda();
+ }
+
+ void setInitialized( const bool ) noexcept {
+ static_assert( "Calling setInitialized on a FunctorBasedMatrix is not allowed." );
+ }
+
+ access_type access( const storage_index_type &storage_index ) const {
+ T result = 0;
+ data_lambda( result, imf_r.map( storage_index.first ), imf_c.map( storage_index.second ) );
+ return static_cast< access_type >( result );
+ }
+
+ storage_index_type getStorageIndex( const size_t i, const size_t j, const size_t s, const size_t P ) const {
+ (void)s;
+ (void)P;
+ return std::make_pair( i, j );
+ }
+
+ public:
+
+ FunctorBasedMatrix(
+ initialized_functor_type initialized_lambda,
+ ImfR imf_r,
+ ImfC imf_c,
+ const DataLambdaType data_lambda
+ ) :
+ initialized_lambda( initialized_lambda ),
+ imf_r( imf_r ),
+ imf_c( imf_c ),
+ data_lambda( data_lambda ) {}
+
+ }; // class FunctorBasedMatrix
+
+ } // namespace internal
+
+} // namespace alp
+
+#endif // end ``_H_ALP_AMF_BASED_FUNCTORBASEDMATRIX''
diff --git a/include/alp/amf-based/matrix.hpp b/include/alp/amf-based/matrix.hpp
new file mode 100644
index 000000000..c5c94df32
--- /dev/null
+++ b/include/alp/amf-based/matrix.hpp
@@ -0,0 +1,1369 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 14th of January 2022
+ */
+
+#ifndef _H_ALP_AMF_BASED_MATRIX
+#define _H_ALP_AMF_BASED_MATRIX
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "functorbasedmatrix.hpp"
+#include "storage.hpp"
+#include "storagebasedmatrix.hpp"
+#include "vector.hpp"
+
+
+namespace alp {
+
+ /** Identifies any backend's implementation of ALP matrix as an ALP matrix. */
+ template<
+ typename T, typename Structure, enum Density density,
+ typename View, typename ImfR, typename ImfC, enum Backend backend
+ >
+ struct is_matrix< Matrix< T, Structure, density, View, ImfR, ImfC, backend > > : std::true_type {};
+
+ // Matrix-related implementation
+
+ namespace internal {
+
+ /** Forward declaration */
+ template< typename DerivedMatrix >
+ class MatrixBase;
+
+ template< typename DerivedMatrix >
+ std::pair< size_t, size_t > dims( const MatrixBase< DerivedMatrix > &A ) noexcept;
+
+ template<
+ typename MatrixType,
+ std::enable_if_t< internal::is_storage_based< MatrixType >::value > *
+ >
+ size_t getStorageDimensions( const MatrixType &A ) noexcept;
+
+ template< typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType>::value > *
+ >
+ bool getInitialized( const MatrixType &A ) noexcept;
+
+ template< typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType>::value > *
+ >
+ void setInitialized( MatrixType &, const bool ) noexcept;
+
+ /** Forward declarations for access functions */
+ template<
+ typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType >::value > * = nullptr
+ >
+ typename MatrixType::const_access_type access( const MatrixType &, const typename MatrixType::storage_index_type & );
+
+ template<
+ typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType >::value > * = nullptr
+ >
+ typename MatrixType::access_type access( MatrixType &, const typename MatrixType::storage_index_type & );
+
+ template<
+ typename MatrixType,
+ std::enable_if< is_matrix< MatrixType >::value > * = nullptr
+ >
+ typename MatrixType::storage_index_type getStorageIndex( const MatrixType &A, const size_t i, const size_t j, const size_t s = 0, const size_t P = 1 );
+
+ template< typename DerivedMatrix >
+ std::pair< size_t, size_t > dims( const MatrixBase< DerivedMatrix > & A ) noexcept;
+
+ template<
+ typename MatrixType,
+ std::enable_if_t< internal::is_storage_based< MatrixType >::value > *
+ >
+ size_t getStorageDimensions( const MatrixType &A ) noexcept;
+
+ template< typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType>::value > *
+ >
+ bool getInitialized( const MatrixType &A ) noexcept;
+
+ template< typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType>::value > *
+ >
+ void setInitialized( MatrixType &, const bool ) noexcept;
+ /**
+ * Base Matrix class containing attributes common to all Matrix specialization
+ */
+ template< typename DerivedMatrix >
+ class MatrixBase {
+
+ friend std::pair< size_t, size_t > dims<>( const MatrixBase< DerivedMatrix > &A ) noexcept;
+
+ template< typename MatrixType, std::enable_if_t< is_matrix< MatrixType>::value > * >
+ friend bool getInitialized( const MatrixType &A ) noexcept;
+
+ template< typename MatrixType, std::enable_if_t< is_matrix< MatrixType>::value > * >
+ friend void setInitialized( MatrixType &A, const bool initialized ) noexcept;
+
+ protected:
+
+ std::pair< size_t, size_t > dims() const noexcept {
+ return static_cast< const DerivedMatrix & >( *this ).dims();
+ }
+
+ template<
+ typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType >::value > *
+ >
+ friend typename MatrixType::const_access_type access( const MatrixType &A, const typename MatrixType::storage_index_type &storageIndex );
+
+ template<
+ typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType >::value > *
+ >
+ friend typename MatrixType::access_type access( MatrixType &A, const typename MatrixType::storage_index_type &storageIndex );
+
+ template<
+ typename MatrixType,
+ std::enable_if< is_matrix< MatrixType >::value > *
+ >
+ friend typename MatrixType::storage_index_type getStorageIndex( const MatrixType &A, const size_t i, const size_t j, const size_t s, const size_t P );
+
+ bool getInitialized() const {
+ return static_cast< const DerivedMatrix & >( *this ).getInitialized();
+ }
+
+ void setInitialized( const bool initialized ) {
+ static_cast< DerivedMatrix & >( *this ).setInitialized( initialized );
+ }
+
+ template< typename ConstAccessType, typename StorageIndexType >
+ ConstAccessType access( const StorageIndexType storageIndex ) const {
+ static_assert( std::is_same< ConstAccessType, typename DerivedMatrix::const_access_type >::value );
+ static_assert( std::is_same< StorageIndexType, typename DerivedMatrix::storage_index_type >::value );
+ return static_cast< const DerivedMatrix & >( *this ).access( storageIndex );
+ }
+
+ template< typename AccessType, typename StorageIndexType >
+ AccessType access( const StorageIndexType &storageIndex ) {
+ static_assert( std::is_same< AccessType, typename DerivedMatrix::access_type >::value );
+ static_assert( std::is_same< StorageIndexType, typename DerivedMatrix::storage_index_type >::value );
+ return static_cast< DerivedMatrix & >( *this ).access( storageIndex );
+ }
+
+ template< typename StorageIndexType >
+ StorageIndexType getStorageIndex( const size_t i, const size_t j, const size_t s, const size_t P ) const {
+ static_assert( std::is_same< StorageIndexType, typename DerivedMatrix::storage_index_type >::value );
+ return static_cast< const DerivedMatrix & >( *this ).getStorageIndex( i, j, s, P );
+ }
+
+ };
+
+ template<
+ typename T,
+ typename Structure,
+ enum Density density,
+ typename View,
+ typename ImfR,
+ typename ImfC,
+ enum Backend backend
+ >
+ struct matrix_base_class {
+ typedef typename std::conditional<
+ internal::is_view_over_functor< View >::value,
+ internal::FunctorBasedMatrix< T, ImfR, ImfC, typename View::applied_to >,
+ internal::StorageBasedMatrix< T,
+ typename internal::determine_amf_type< Structure, View, ImfR, ImfC, backend >::type,
+ internal::requires_allocation< View >::value,
+ backend
+ >
+ >::type type;
+ };
+
+ } // namespace internal
+
+ /**
+ * \brief An ALP structured matrix.
+ *
+ * This is an opaque data type for structured matrices.
+ *
+ * A structured matrix exposes a mathematical
+ * \em logical layout which allows to express implementation-oblivious concepts
+ * including the matrix structure itself and \em views on the matrix.
+ * The logical layout of a structured matrix maps to a physical counterpart via
+ * a storage scheme which typically depends on the chosen structure and the selected
+ * backend. alp::Matrix and alp::Vector may be used as interfaces to such a physical
+ * layout.
+ * To visualize this, you may think of a band matrix. Using a
+ * full dense or a banded storage schemes would require
+ * the use of a \a alp::Matrix container (see include/alp/density.hpp for
+ * more details about the supported storage schemes). However, the interpretation of its
+ * content would differ in the two cases being a function of both the Structure
+ * information and the storage scheme combined.
+ *
+ * Views can be used to create logical \em perspectives on top of a container.
+ * For example, one may decide to refer to the transpose of a matrix or to treat
+ * for a limited part of a program a square matrix as symmetric.
+ * If a view can be expressed as a concept \em invariant of specific runtime features,
+ * such views can be defined statically (for example, one may always refer to the
+ * transpose or the diagonal of a matrix irrespective of features such as the matrix's
+ * size). Others may depend on features such as the size of a matrix
+ * (e.g., gathering/scattering the rows/columns of a matrix or permuting them).
+ *
+ * Structured matrices defined as views on other matrices do not instantiate a
+ * new container but refer to the one used by their targets.
+ * See the two specializations
+ * \a Matrix
+ * and \a Matrix, backend >
+ * as examples of structured matrix types without and with physical container, respectively.
+ *
+ *
+ * @tparam T The type of the matrix elements. \a T shall not be a GraphBLAS
+ * type.
+ * @tparam Structure One of the matrix structures defined in \a alp::structures.
+ * @tparam density Either \em enum \a Density::Dense or \em enum
+ * \a storage::Sparse.
+ * @tparam View One of the matrix views in \a alp::view.
+ * All static views except for \a view::Original (via
+ * \a view::Original cannot instantiate a new container
+ * and only allow to refer to a previously defined
+ * \a Matrix.
+ * The \a View parameter should not be used directly
+ * by the user but selected via \a get_view function.
+ *
+ * See examples of Matrix definitions within \a include/alp/reference/matrix.hpp
+ * and the \a dense_structured_matrix.cpp unit test.
+ *
+ */
+ template< typename T, typename Structure, enum Density density, typename View, typename ImfR, typename ImfC, enum Backend backend >
+ class Matrix :
+ public internal::matrix_base_class< T, Structure, Density::Dense, View, ImfR, ImfC, backend >::type {
+
+ protected:
+ typedef Matrix< T, Structure, Density::Dense, View, ImfR, ImfC, backend > self_type;
+
+ /*********************
+ Storage info friends
+ ******************** */
+
+ template< typename fwd_iterator >
+ friend RC buildMatrix( Matrix< T, Structure, Density::Dense, View, ImfR, ImfC, backend > &A,
+ const fwd_iterator & start, const fwd_iterator & end );
+
+ template< typename fwd_iterator >
+ RC buildMatrixUnique( const fwd_iterator &start, const fwd_iterator &end ) {
+ std::cout << "Building Matrix<>; calling buildMatrix( Matrix<> )\n";
+ return buildMatrix( *(this->_container), start, end );
+ }
+
+ public:
+ /** Exposes the types and the static properties. */
+ typedef Structure structure;
+ /**
+ * Indicates if a matrix needs to allocate data-related memory
+ * (for the internal container or functor object).
+ * False if it is a view over another matrix or a functor.
+ */
+ static constexpr bool requires_allocation = internal::requires_allocation< View >::value;
+
+ /**
+ * Expose the base type class to enable internal functions to cast
+ * the type of objects of this class to the base class type.
+ */
+ typedef typename internal::matrix_base_class< T, Structure, Density::Dense, View, ImfR, ImfC, backend >::type base_type;
+
+ template < view::Views view_tag, bool d = false >
+ struct view_type;
+
+ template < bool d >
+ struct view_type< view::original, d > {
+ using type = Matrix< T, Structure, Density::Dense, view::Original< self_type >, imf::Id, imf::Id, backend >;
+ };
+
+ template < bool d >
+ struct view_type< view::gather, d > {
+ using type = Matrix<
+ T,
+ typename structures::apply_view< view::gather, Structure >::type,
+ Density::Dense, view::Gather< self_type >, imf::Strided, imf::Strided, backend
+ >;
+ };
+
+ template < bool d >
+ struct view_type< view::transpose, d > {
+ using type = Matrix<
+ T,
+ typename structures::apply_view< view::transpose, Structure >::type,
+ Density::Dense, view::Transpose< self_type >, imf::Id, imf::Id, backend
+ >;
+ };
+
+ template < bool d >
+ struct view_type< view::diagonal, d > {
+ using type = Vector< T, structures::General, Density::Dense, view::Diagonal< self_type >, imf::Id, imf::Zero, backend >;
+ };
+
+ /**
+ * Constructor for a storage-based matrix that allocates storage.
+ * Specialization for a matrix with not necessarily equal row and column dimensions.
+ */
+ template<
+ typename ThisStructure = Structure,
+ std::enable_if_t<
+ internal::is_view_over_storage< View >::value &&
+ internal::requires_allocation< View >::value &&
+ !structures::is_in< structures::Square, typename ThisStructure::inferred_structures >::value
+ > * = nullptr
+ >
+ Matrix( const size_t rows, const size_t cols, const size_t cap = 0 ) :
+ base_type(
+ storage::AMFFactory< backend >::template FromPolynomial<
+ Structure, ImfR, ImfC
+ >::Create(
+ ImfR( rows ),
+ ImfC( cols )
+ )
+ ) {
+
+ (void) cap;
+
+ // This check should be performed in the class body rather than here.
+ // Allocation-requiring matrix with incompatible IMFs should not be instantiable at all.
+ // Here it is only forbidden to invoke this constructor for such a matrix.
+ static_assert(
+ std::is_same< ImfR, imf::Id >::value &&
+ ( std::is_same< ImfC, imf::Id >::value || std::is_same< ImfC, imf::Zero >::value ),
+ "This constructor can only be used with a matrix having Id IMFs."
+ );
+
+ }
+
+ /*
+ * Constructor for a storage-based matrix that allocates storage.
+ * Specialization for matrices with equal row and column dimensions.
+ */
+ template<
+ typename ThisStructure = Structure,
+ std::enable_if_t<
+ internal::is_view_over_storage< View >::value &&
+ internal::requires_allocation< View >::value &&
+ structures::is_in< structures::Square, typename ThisStructure::inferred_structures >::value
+ > * = nullptr
+ >
+ Matrix( const size_t dim, const size_t cap = 0 ) :
+ base_type(
+ storage::AMFFactory< backend >::template FromPolynomial<
+ Structure, ImfR, ImfC
+ >::Create(
+ ImfR( dim ),
+ ImfC( dim )
+ )
+ ) {
+
+ (void) cap;
+
+ static_assert(
+ std::is_same< ImfR, imf::Id >::value &&
+ ( std::is_same< ImfC, imf::Id >::value || std::is_same< ImfC, imf::Zero >::value ),
+ "This constructor can only be used with a matrix having Id IMFs."
+ );
+
+ }
+
+ /**
+ * Constructor for a view over another storage-based matrix.
+ *
+ * @tparam SourceType The type of the target matrix.
+ *
+ */
+ template<
+ typename SourceType,
+ std::enable_if_t<
+ std::is_same< SourceType, typename View::applied_to >::value &&
+ internal::is_view_over_storage< View >::value &&
+ !internal::requires_allocation< View >::value
+ > * = nullptr
+ >
+ Matrix( SourceType &source_matrix, ImfR imf_r, ImfC imf_c ) :
+ base_type(
+ getContainer( source_matrix ),
+ storage::AMFFactory< backend >::template Compose<
+ ImfR, ImfC, typename SourceType::base_type::amf_type
+ >::Create( imf_r, imf_c, internal::getAmf( source_matrix ) )
+ ) {}
+
+ /**
+ * Constructor for a view over another matrix applying a view defined
+ * by View template parameter of the constructed matrix.
+ *
+ * @tparam SourceType The type of the target matrix.
+ *
+ */
+ template<
+ typename SourceType,
+ std::enable_if_t<
+ std::is_same< SourceType, typename View::applied_to >::value &&
+ internal::is_view_over_storage< View >::value &&
+ !internal::requires_allocation< View >::value
+ > * = nullptr
+ >
+ Matrix( SourceType &source_matrix ) :
+ base_type(
+ getContainer( source_matrix ),
+ storage::AMFFactory< backend >::template Reshape< View::type_id, typename SourceType::amf_type >::Create( internal::getAmf( source_matrix ) )
+ ) {}
+
+ /**
+ * Constructor for a view over an internal container of another matrix.
+ *
+ * @tparam SourceType The type of the target matrix.
+ * @tparam AmfType The type of the amf corresponding to the layout of
+ * the provided container.
+ * Used as a template parameter to avoid hard
+ * compilation error in the case of FunctorBasedMatrix,
+ * when base_type::amf_type does not exist.
+ */
+ template<
+ typename BufferType,
+ typename AmfType,
+ std::enable_if_t<
+ !is_container< BufferType >::value &&
+ internal::is_view_over_storage< View >::value
+ > * = nullptr
+ >
+ Matrix( BufferType &&buffer, const size_t buffer_size, AmfType &&amf ) :
+ base_type(
+ buffer,
+ buffer_size,
+ std::forward< typename base_type::amf_type >( amf )
+ ) {
+ static_assert(
+ std::is_same< typename base_type::amf_type, typename std::remove_reference< AmfType >::type >::value,
+ "The type of the provided AMF does not match the type of constructed container's AMF"
+ );
+ }
+
+ /**
+ * Constructor for a view over another matrix' internal container.
+ *
+ * @tparam ContainerType The type of the internal container.
+ * @tparam AmfType The type of the amf used to construct the matrix.
+ * Used as a template parameter to benefit from
+ * SFINAE for the case of FunctorBasedMatrix, when
+ * base_type::amf_type does not exist and, therefore,
+ * using the expression base_type::amf_type would
+ * result in a hard compilation error.
+ */
+ template<
+ typename ContainerType,
+ typename AmfType,
+ std::enable_if_t<
+ internal::is_container< ContainerType >::value &&
+ internal::is_view_over_storage< View >::value &&
+ !internal::requires_allocation< View >::value
+ > * = nullptr
+ >
+ Matrix( ContainerType &container, AmfType &&amf ) :
+ base_type(
+ container,
+ std::forward< typename base_type::amf_type >( amf )
+ ) {
+ static_assert(
+ std::is_same< typename base_type::amf_type, typename std::remove_reference< AmfType >::type >::value,
+ "The AMF type of the constructor parameter needs to match the AMF type of this container specialization."
+ );
+ }
+
+ /**
+ * Constructor for a functor-based matrix that allocates memory.
+ * Specialization for a matrix with not necessarily equal row and column dimensions.
+ *
+ * @tparam LambdaType The type of the lambda function associated with the data.
+ *
+ */
+ template<
+ typename LambdaType,
+ std::enable_if_t<
+ std::is_same< LambdaType, typename View::applied_to >::value &&
+ internal::is_view_over_functor< View >::value &&
+ internal::requires_allocation< View >::value &&
+ !structures::is_in< structures::Square, typename Structure::inferred_structures >::value
+ > * = nullptr
+ >
+ Matrix( std::function< bool() > initialized, const size_t rows, const size_t cols, LambdaType lambda ) :
+ base_type( initialized, imf::Id( rows ), imf::Id( cols ), lambda ) {
+
+ static_assert(
+ std::is_same< ImfR, imf::Id >::value &&
+ std::is_same< ImfC, imf::Id >::value,
+ "This constructor can only be used with Id IMFs."
+ );
+
+ }
+
+ /**
+ * Constructor for a functor-based matrix that allocates memory.
+ * Specialization for a matrix with equal row and column dimensions.
+ *
+ * @tparam LambdaType The type lambda function associated to the data.
+ *
+ */
+ template<
+ typename LambdaType,
+ std::enable_if_t<
+ std::is_same< LambdaType, typename View::applied_to >::value &&
+ internal::is_view_over_functor< View >::value &&
+ internal::requires_allocation< View >::value &&
+ structures::is_in< structures::Square, typename Structure::inferred_structures >::value
+ > * = nullptr
+ >
+ Matrix( std::function< bool() > initialized, const size_t dim, LambdaType lambda ) :
+ base_type( initialized, imf::Id( dim ), imf::Id( dim ), lambda ) {
+
+ static_assert(
+ std::is_same< ImfR, imf::Id >::value &&
+ std::is_same< ImfC, imf::Id >::value,
+ "This constructor can only be used with Id IMFs."
+ );
+
+ }
+
+ /**
+ * Constructor for a view over another functor-based matrix.
+ *
+ * @tparam SourceType The type of the target matrix.
+ *
+ */
+ template<
+ typename SourceType,
+ std::enable_if_t<
+ std::is_same< SourceType, typename View::applied_to >::value &&
+ internal::is_view_over_functor< View >::value &&
+ !internal::requires_allocation< View >::value
+ > * = nullptr
+ >
+ Matrix( SourceType &source_matrix, ImfR imf_r, ImfC imf_c ) :
+ base_type( getFunctor( source_matrix ), imf_r, imf_c ) {}
+
+ /**
+ * @deprecated
+ * Constructor for a view over another functor-based matrix.
+ *
+ * @tparam SourceType The type of the target matrix.
+ *
+ */
+ template<
+ typename SourceType,
+ std::enable_if_t<
+ std::is_same< SourceType, typename View::applied_to >::value &&
+ internal::is_view_over_functor< View >::value &&
+ !internal::requires_allocation< View >::value
+ > * = nullptr
+ >
+ Matrix( SourceType &source_matrix ) :
+ Matrix( getFunctor( source_matrix ),
+ imf::Id( nrows ( source_matrix ) ),
+ imf::Id( ncols ( source_matrix ) )
+ ) {
+
+ static_assert(
+ std::is_same< ImfR, imf::Id >::value &&
+ std::is_same< ImfC, imf::Id >::value,
+ "This constructor can only be used with Id IMFs."
+ );
+
+ }
+ }; // ALP Matrix
+
+ namespace structures {
+
+ /**
+ * Calculates the iteration space for row-dimension for the given matrix and band index.
+ *
+ * @tparam MatrixType The type of ALP matrix
+ * @tparam band_index The index of the desired matrix band
+ *
+ * @param[in] A ALP matrix
+ *
+ * @returns a pair of size_t values,
+ * the first representing lower and the second upper limit.
+ *
+ * \note Each backend shall specialize this function as its implementation
+ * depends on the way backend handles storage of different structures.
+ */
+ template<
+ size_t band_index, typename MatrixType,
+ std::enable_if_t<
+ is_matrix< MatrixType >::value
+ > * = nullptr
+ >
+ std::pair< size_t, size_t > calculate_row_coordinate_limits( const MatrixType &A );
+
+ /**
+ * Calculates the iteration space for column-dimension for the given matrix, band index and row index.
+ *
+ * @tparam MatrixType The type of ALP matrix
+ * @tparam band_index The index of the desired matrix band
+ *
+ * @param[in] A ALP matrix
+ * @param[in] row Row index
+ *
+ * @returns a pair of size_t values,
+ * the first representing lower and the second upper limit.
+ *
+ * \note Each backend shall specialize this function as its implementation
+ * depends on the way backend handles storage of different structures.
+ */
+ template<
+ size_t band_index, typename MatrixType,
+ std::enable_if_t<
+ is_matrix< MatrixType >::value
+ > * = nullptr
+ >
+ std::pair< size_t, size_t > calculate_column_coordinate_limits( const MatrixType &A, const size_t row );
+
+ } // namespace structures
+
+ /**
+ *
+ * @brief Generate a view specified by \a target_view where the type is compliant with the
+ * \a source matrix.
+ * The function guarantees the created view is non-overlapping with other
+ * existing views only when the check can be performed in constant time.
+ *
+ * @tparam target_view One of the supported views listed in \a view::Views
+ * @tparam SourceMatrix The type of the source matrix
+ *
+ * @param source The source ALP matrix
+ *
+ * @return A new \a target_view view over the source matrix.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This function performs
+ * \f$ \Theta(nref) \f$ amount of work where \f$ nref \f$ is the number
+ * of available views of \a source.
+ * -# A call to this function may use \f$ \mathcal{O}(1) \f$ bytes
+ * of memory beyond the memory in use at the function call entry.
+ * -# This function may make system calls.
+ * \endparblock
+ *
+ */
+ template<
+ enum view::Views target_view = view::original,
+ typename SourceMatrix,
+ std::enable_if_t<
+ is_matrix< SourceMatrix >::value &&
+ target_view != view::diagonal
+ > * = nullptr
+ >
+ typename SourceMatrix::template view_type< target_view >::type
+ get_view( SourceMatrix &source ) {
+
+ using target_strmat_t = typename SourceMatrix::template view_type< target_view >::type;
+
+ return target_strmat_t( source );
+ }
+
+ /** Specialization for diagonal view over Square matrix */
+ template<
+ enum view::Views target_view = view::original,
+ typename SourceMatrix,
+ std::enable_if_t<
+ is_matrix< SourceMatrix >::value &&
+ target_view == view::diagonal &&
+ structures::is_in< structures::Square, typename SourceMatrix::structure::inferred_structures >::value
+ > * = nullptr
+ >
+ typename SourceMatrix::template view_type< view::diagonal >::type
+ get_view( SourceMatrix &source ) {
+
+ using target_t = typename SourceMatrix::template view_type< view::diagonal >::type;
+ return target_t( source );
+ }
+
+ /**
+ * Specialization for diagonal view over non-Square matrix.
+ * A diagonal view is created over an intermediate gather
+ * view with a square structure.
+ */
+ template<
+ enum view::Views target_view = view::original,
+ typename SourceMatrix,
+ std::enable_if_t<
+ is_matrix< SourceMatrix >::value &&
+ target_view == view::diagonal &&
+ !structures::is_in< structures::Square, typename SourceMatrix::structure::inferred_structures >::value
+ > * = nullptr
+ >
+ typename internal::new_container_type_from<
+ typename SourceMatrix::template view_type< view::gather >::type
+ >::template change_structure< structures::Diagonal >::type
+ ::template view_type< view::diagonal >::type
+ get_view( SourceMatrix &source ) {
+
+ const size_t source_rows = nrows( source );
+ const size_t source_cols = ncols( source );
+ const size_t smaller_dimension = std::min( source_rows, source_cols );
+ auto square_view = get_view< structures::Diagonal >( source, utils::range( 0, smaller_dimension ), utils::range( 0, smaller_dimension ) );
+ return get_view< view::diagonal >( square_view );
+ }
+
+ /**
+ *
+ * @brief Generate an original view where the type is compliant with the source Matrix.
+ * Version where a target structure is specified. It can only generate a valid type if the target
+ * structure is the same as the source's
+ * or a more specialized one that would preserve its static properties (e.g., symmetric reference
+ * to a square matrix -- any assumption based on symmetry would not break those based on square).
+ * The function guarantees the created view is non-overlapping with other existing views only when the
+ * check can be performed in constant time.
+ *
+ * @tparam TargetStructure The target structure of the new view. It should verify
+ * alp::is_in .
+ * @tparam SourceMatrix The type of the source matrix
+ *
+ * @param source The source ALP matrix
+ *
+ * @return A new original view over the source ALP matrix.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This function performs
+ * \f$ \Theta(nref) \f$ amount of work where \f$ nref \f$ is the number
+ * of available views of \a source.
+ * -# A call to this function may use \f$ \mathcal{O}(1) \f$ bytes
+ * of memory beyond the memory in use at the function call entry.
+ * -# This function may make system calls.
+ * \endparblock
+ *
+ */
+ template<
+ typename TargetStructure,
+ typename SourceMatrix,
+ // enable_if_t (not enable_if): enable_if<false>* is a well-formed type, which
+ // would leave this overload unconstrained and ambiguous with other get_view overloads
+ std::enable_if_t< is_matrix< SourceMatrix >::value > * = nullptr
+ >
+ typename internal::new_container_type_from<
+ typename SourceMatrix::template view_type< view::original >::type
+ >::template change_structure< TargetStructure >::type
+ get_view( SourceMatrix &source ) {
+
+ static_assert( structures::is_in< typename SourceMatrix::structure, typename TargetStructure::inferred_structures >::value,
+ "Can only create a view when the target structure is compatible with the source." );
+
+ using target_strmat_t = typename internal::new_container_type_from<
+ typename SourceMatrix::template view_type< view::original >::type
+ >::template change_structure< TargetStructure >::type;
+
+ return target_strmat_t( source );
+ }
+
+ namespace internal {
+
+ /**
+ * Implement a gather through a View over compatible Structure using provided Index Mapping Functions.
+ * The compatibility depends on the TargetStructure, SourceStructure and IMFs, and is calculated during runtime.
+ */
+ template<
+ typename TargetStructure, typename TargetImfR, typename TargetImfC,
+ typename SourceMatrix,
+ std::enable_if_t< is_matrix< SourceMatrix >::value > * = nullptr
+ >
+ typename internal::new_container_type_from<
+ typename SourceMatrix::template view_type< view::gather >::type
+ >::template change_structure< TargetStructure >::_and_::
+ template change_imfr< TargetImfR >::_and_::
+ template change_imfc< TargetImfC >::type
+ get_view( SourceMatrix &source, TargetImfR imf_r, TargetImfC imf_c ) {
+
+ //if( std::dynamic_pointer_cast< imf::Select >( imf_r ) || std::dynamic_pointer_cast< imf::Select >( imf_c ) ) {
+ // throw std::runtime_error("Cannot gather with imf::Select yet.");
+ //}
+ // No static check as the compatibility depends on IMF, which is a runtime level parameter
+ //if( ! (TargetStructure::template isInstantiableFrom< Structure >( static_cast< TargetImfR & >( imf_r ), static_cast< TargetImfR & >( imf_c ) ) ) ) {
+ if( ! (structures::isInstantiable< typename SourceMatrix::structure, TargetStructure >::check( imf_r, imf_c ) ) ) {
+ throw std::runtime_error("Cannot gather into specified TargetStructure from provided SourceStructure and Index Mapping Functions.");
+ }
+
+ using target_t = typename internal::new_container_type_from<
+ typename SourceMatrix::template view_type< view::gather >::type
+ >::template change_structure< TargetStructure >::_and_::
+ template change_imfr< TargetImfR >::_and_::
+ template change_imfc< TargetImfC >::type;
+
+ return target_t( source, imf_r, imf_c );
+ }
+ } // namespace internal
+
+ /**
+ *
+ * @brief Generate an original view where the type is compliant with the source Matrix.
+ * Version where a range of rows and columns are selected to form a new view with specified target
+ * structure. It can only generate a valid type if the target
+ * structure is guaranteed to preserve the static properties of the source's structure.
+ * A structural check of this kind as well as non-overlapping checks with existing views of \a source
+ * are guaranteed only when each one of them incurs constant time work.
+ *
+ * @tparam TargetStructure The target structure of the new view. It should verify
+ * alp::is_in .
+ * @tparam SourceMatrix The type of source ALP matrix
+ *
+ * @param source The source ALP matrix
+ * @param rng_r A valid range of rows
+ * @param rng_c A valid range of columns
+ *
+ * @return A new original view over the source ALP matrix.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This function performs
+ * \f$ \Theta(nref) \f$ amount of work where \f$ nref \f$ is the number
+ * of available views of \a source.
+ * -# A call to this function may use \f$ \mathcal{O}(1) \f$ bytes
+ * of memory beyond the memory in use at the function call entry.
+ * -# This function may make system calls.
+ * \endparblock
+ *
+ */
+ template<
+ typename TargetStructure,
+ typename SourceMatrix,
+ std::enable_if_t< is_matrix< SourceMatrix >::value > * = nullptr
+ >
+ typename internal::new_container_type_from<
+ typename SourceMatrix::template view_type< view::gather >::type
+ >::template change_structure< TargetStructure >::type
+ get_view(
+ SourceMatrix &source,
+ const utils::range& rng_r, const utils::range& rng_c
+ ) {
+
+ return internal::get_view< TargetStructure >(
+ source,
+ std::move( imf::Strided( rng_r.count(), nrows(source), rng_r.start, rng_r.stride ) ),
+ std::move( imf::Strided( rng_c.count(), ncols(source), rng_c.start, rng_c.stride ) )
+ );
+ }
+
+ /**
+ *
+ * @brief Generate an original view where the type is compliant with the source Matrix.
+ * Version where no target structure is specified (in this case the structure of the source type is assumed as target)
+ * with row and column selection.
+ * A structure preserving check as well as non-overlapping checks with existing views of \a source
+ * are guaranteed only when each one of them incurs constant time work.
+ *
+ * @tparam SourceMatrix The type of source ALP matrix
+ *
+ * @param source The source matrix
+ * @param rng_r A valid range of rows
+ * @param rng_c A valid range of columns
+ *
+ * @return A new original view over the source structured matrix.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This function performs
+ * \f$ \Theta(nref) \f$ amount of work where \f$ nref \f$ is the number
+ * of available views of \a source.
+ * -# A call to this function may use \f$ \mathcal{O}(1) \f$ bytes
+ * of memory beyond the memory in use at the function call entry.
+ * -# This function may make system calls.
+ * \endparblock
+ *
+ */
+
+ template<
+ typename SourceMatrix,
+ std::enable_if_t< is_matrix< SourceMatrix >::value > * = nullptr
+ >
+ typename SourceMatrix::template view_type< view::gather >::type
+ get_view(
+ SourceMatrix &source,
+ const utils::range &rng_r,
+ const utils::range &rng_c
+ ) {
+
+ return internal::get_view< typename SourceMatrix::structure >(
+ source,
+ imf::Strided( rng_r.count(), nrows(source), rng_r.start, rng_r.stride ),
+ imf::Strided( rng_c.count(), ncols(source), rng_c.start, rng_c.stride ) );
+ }
+
+ /**
+ *
+ * @brief Generate a vector view on a column of the source matrix.
+ *
+ * @tparam SourceMatrix The type of the source ALP matrix
+ *
+ * @param source The source matrix
+ * @param rng_r A valid range of rows
+ * @param sel_c A valid column index
+ *
+ * @return A new gather view over the source ALP matrix.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This function performs
+ * \f$ \Theta(nref) \f$ amount of work where \f$ nref \f$ is the number
+ * of available views of \a source.
+ * -# A call to this function may use \f$ \mathcal{O}(1) \f$ bytes
+ * of memory beyond the memory in use at the function call entry.
+ * -# This function may make system calls.
+ * \endparblock
+ *
+ */
+ template<
+ typename SourceMatrix,
+ std::enable_if_t< is_matrix< SourceMatrix >::value > * = nullptr
+ >
+ typename internal::new_container_type_from<
+ typename SourceMatrix::template view_type< view::gather >::type
+ >::template change_container< alp::Vector >::_and_::
+ template change_structure< structures::General >::_and_::
+ template change_imfc< imf::Constant >::type
+ get_view(
+ SourceMatrix &source,
+ const utils::range &rng_r,
+ const size_t &sel_c
+ ) {
+ using target_t = typename internal::new_container_type_from<
+ typename SourceMatrix::template view_type< view::gather >::type
+ >::template change_container< alp::Vector >::_and_::
+ template change_structure< structures::General >::_and_::
+ template change_imfc< imf::Constant >::type;
+
+ return target_t(
+ source,
+ imf::Strided( rng_r.count(), nrows( source ), rng_r.start, rng_r.stride ),
+ imf::Constant( 1, ncols( source ), sel_c )
+ );
+ }
+
+ /**
+ *
+ * @brief Generate a vector view on a row of the source matrix.
+ *
+ * @tparam SourceMatrix The type of the source ALP matrix
+ *
+ * @param source The source matrix
+ * @param sel_r A valid row index
+ * @param rng_c A valid range of columns
+ *
+ * @return A new gather view over the source ALP matrix.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This function performs
+ * \f$ \Theta(nref) \f$ amount of work where \f$ nref \f$ is the number
+ * of available views of \a source.
+ * -# A call to this function may use \f$ \mathcal{O}(1) \f$ bytes
+ * of memory beyond the memory in use at the function call entry.
+ * -# This function may make system calls.
+ * \endparblock
+ *
+ * \note \internal Row-view is implemented as a column view over a
+ * transposed source matrix
+ *
+ */
+ template<
+ typename SourceMatrix,
+ std::enable_if_t< is_matrix< SourceMatrix >::value > * = nullptr
+ >
+ typename internal::new_container_type_from<
+ typename SourceMatrix::template view_type< view::transpose >::type::template view_type< view::gather >::type
+ >::template change_container< alp::Vector >::_and_::
+ template change_structure< structures::General >::_and_::
+ template change_imfc< imf::Constant >::type
+ get_view(
+ SourceMatrix &source,
+ const size_t &sel_r,
+ const utils::range &rng_c
+ ) {
+ auto source_transposed = get_view< view::transpose >( source );
+ return get_view( source_transposed, rng_c, sel_r );
+ }
+
+ /**
+ *
+ * Generate a dynamic gather view where the type is compliant with the source Matrix.
+ * Version where a selection of rows and columns, expressed as vectors of indices,
+ * forms a new view with specified target structure.
+ *
+ * @tparam TargetStructure The target structure of the new view. It should verify
+ * alp::is_in .
+ * @tparam SourceMatrix The type of the source ALP matrix
+ * @tparam SelectVectorR The type of the ALP vector defining permutation for rows
+ * @tparam SelectVectorC The type of the ALP vector defining permutation for columns
+ *
+ * @param source The source ALP matrix
+ * @param sel_r A valid permutation vector of a subset of row indices
+ * @param sel_c A valid permutation vector of a subset of column indices
+ *
+ * @return A new gather view over the source ALP matrix.
+ *
+ */
+ template<
+ typename TargetStructure,
+ typename SourceMatrix,
+ typename SelectVectorR, typename SelectVectorC,
+ std::enable_if_t<
+ is_matrix< SourceMatrix >::value &&
+ is_vector< SelectVectorR >::value &&
+ is_vector< SelectVectorC >::value
+ > * = nullptr
+ >
+ typename internal::new_container_type_from<
+ typename SourceMatrix::template view_type< view::gather >::type
+ >::template change_structure< TargetStructure >::_and_::
+ template change_imfr< imf::Select >::_and_::
+ template change_imfc< imf::Select >::type
+ get_view(
+ SourceMatrix &source,
+ const SelectVectorR &sel_r,
+ const SelectVectorC &sel_c
+ ) {
+ return internal::get_view< TargetStructure >(
+ source,
+ imf::Select( nrows( source ), sel_r ),
+ imf::Select( ncols( source ), sel_c )
+ );
+ }
+
+
+ /** Definitions of previously declared global methods that operate on ALP Matrix */
+ namespace internal {
+
+ template<
+ typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType>::value > *
+ >
+ bool getInitialized( const MatrixType &A ) noexcept {
+ // no 'template' disambiguator: getInitialized is a non-template member of MatrixBase
+ return static_cast< const MatrixBase< typename MatrixType::base_type > & >( A ).getInitialized();
+ }
+
+ template<
+ typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType>::value > *
+ >
+ void setInitialized( MatrixType &A, const bool initialized ) noexcept {
+ // no 'template' disambiguator: setInitialized is a non-template member of MatrixBase
+ return static_cast< MatrixBase< typename MatrixType::base_type > & >( A ).setInitialized( initialized );
+ }
+
+ template< typename DerivedMatrix >
+ std::pair< size_t, size_t > dims( const MatrixBase< DerivedMatrix > &A ) noexcept {
+ return A.dims();
+ }
+
+ /** Access the matrix element.
+ *
+ * @tparam MatrixType ALP Matrix type
+ *
+ * @param[in] A matrix to be accessed
+ * @param[in] storageIndex index in the physical iteration space
+ *
+ * @return For container matrices, returns a constant reference to the
+ * element at the given physical position of matrix A.
+ * For functor view matrices, returns a value corresponding to
+ * the given physical position of matrix A.
+ *
+ * \note This method may be used to access only elements local to the processor.
+ */
+ template<
+ typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType >::value > *
+ >
+ typename MatrixType::const_access_type access( const MatrixType &A, const typename MatrixType::storage_index_type &storageIndex ) {
+ return static_cast<
+ const MatrixBase< typename MatrixType::base_type > &
+ >( A ).template access< typename MatrixType::const_access_type, typename MatrixType::storage_index_type >( storageIndex );
+ }
+
+ /** Non-constant variant. **/
+ template<
+ typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType >::value > *
+ >
+ typename MatrixType::access_type access( MatrixType &A, const typename MatrixType::storage_index_type &storageIndex ) {
+ return static_cast<
+ MatrixBase< typename MatrixType::base_type > &
+ >( A ).template access< typename MatrixType::access_type, typename MatrixType::storage_index_type >( storageIndex );
+ }
+
+		/** Return a storage index in the physical layout.
+		 *
+		 * @tparam MatrixType ALP Matrix type
+		 *
+		 * @param[in] A matrix to be accessed
+		 * @param[in] i row-index in the logical layout
+		 * @param[in] j column-index in the logical layout
+		 * @param[in] s process ID
+		 * @param[in] P total number of processors
+		 *
+		 * @return The index into the physical (storage) iteration space of
+		 *         matrix A that corresponds to the logical coordinates
+		 *         ( i, j ) on process s out of P processes; delegates to
+		 *         the getStorageIndex member of MatrixBase.
+		 *
+		 */
+		template<
+			typename MatrixType,
+			std::enable_if_t< is_matrix< MatrixType >::value > *
+		>
+		typename MatrixType::storage_index_type getStorageIndex( const MatrixType &A, const size_t i, const size_t j, const size_t s, const size_t P ) {
+			return static_cast< const MatrixBase< typename MatrixType::base_type > & >( A ).template getStorageIndex< typename MatrixType::storage_index_type >( i, j, s, P );
+		}
+
+ /** Return a pair of coordinates in logical layout.
+ *
+ * @tparam MatrixType ALP Matrix type
+ *
+ * @param[in] A matrix to be accessed
+ * @param[in] storageIndex storage index in the physical layout.
+ * @param[in] s process ID
+ * @param[in] P total number of processors
+ *
+ * @return Returns a pair of coordinates in logical iteration space
+ * that correspond to the provided storage index in the
+ * physical iteration space.
+ *
+ */
+ template< typename MatrixType >
+ std::pair< size_t, size_t > getCoords( const MatrixType &A, const size_t storageIndex, const size_t s, const size_t P );
+
+ } // namespace internal
+
+ template< typename D, typename Structure, typename View, typename ImfR, typename ImfC, enum Backend backend >
+ size_t nrows( const Matrix< D, Structure, Density::Dense, View, ImfR, ImfC, backend > &A ) noexcept {
+ return dims( A ).first;
+ }
+
+ template< typename D, typename Structure, typename View, typename ImfR, typename ImfC, enum Backend backend >
+ size_t ncols( const Matrix< D, Structure, Density::Dense, View, ImfR, ImfC, backend > &A ) noexcept {
+ return dims( A ).second;
+ }
+
+ template< typename D, typename Structure, typename View, typename ImfR, typename ImfC, enum Backend backend >
+ std::pair< size_t, size_t > dims( const Matrix< D, Structure, Density::Dense, View, ImfR, ImfC, backend > &A ) noexcept {
+ return internal::dims( static_cast< const internal::MatrixBase<
+ typename Matrix< D, Structure, Density::Dense, View, ImfR, ImfC, backend >::base_type > & > ( A ) );
+ }
+
+ namespace structures {
+
+ template<
+ size_t band,
+ typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType >::value > * = nullptr
+ >
+ std::ptrdiff_t get_lower_limit( const MatrixType &A ) {
+
+ return structures::get_lower_limit< band, typename MatrixType::structure >( nrows( A ) );
+
+ }
+
+ template<
+ size_t band,
+ typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType >::value > * = nullptr
+ >
+ std::ptrdiff_t get_upper_limit( const MatrixType &A ) {
+
+ return structures::get_upper_limit< band, typename MatrixType::structure >( ncols( A ) );
+
+ }
+
+ /**
+ * Specialization for reference backend.
+ * @see alp::structures::calculate_row_coordinate_limits
+ */
+ template<
+ size_t band_index, typename MatrixType,
+ std::enable_if_t<
+ is_matrix< MatrixType >::value
+ > *
+ >
+ std::pair< size_t, size_t > calculate_row_coordinate_limits( const MatrixType &A ) {
+
+ using Structure = typename MatrixType::structure;
+
+ static_assert(
+ band_index < std::tuple_size< typename Structure::band_intervals >::value,
+ "Provided band index is out of bounds."
+ );
+
+ // cast matrix dimensions to signed integer to allow for comparison with negative numbers
+ const std::ptrdiff_t M = static_cast< std::ptrdiff_t >( nrows( A ) );
+ const std::ptrdiff_t N = static_cast< std::ptrdiff_t >( ncols( A ) );
+
+ // band limits are negated and inverted due to different orientation
+ // of coordinate system of band and matrix dimensions.
+ const std::ptrdiff_t l = -structures::get_upper_limit< band_index >( A );
+ const std::ptrdiff_t u = N - structures::get_lower_limit< band_index >( A );
+
+ // fit the limits within the matrix dimensions
+ const size_t lower_limit = static_cast< size_t >( std::max( std::min( l, M ), static_cast< std::ptrdiff_t >( 0 ) ) );
+ const size_t upper_limit = static_cast< size_t >( std::max( std::min( u, M ), static_cast< std::ptrdiff_t >( 0 ) ) );
+
+ assert( lower_limit <= upper_limit );
+
+ return std::make_pair( lower_limit, upper_limit );
+ }
+
+ /**
+ * Specialization for reference backend.
+ * @see alp::structures::calculate_column_coordinate_limits
+ */
+ template<
+ size_t band_index, typename MatrixType,
+ std::enable_if_t<
+ is_matrix< MatrixType >::value
+ > *
+ >
+ std::pair< size_t, size_t > calculate_column_coordinate_limits( const MatrixType &A, const size_t row ) {
+
+ using Structure = typename MatrixType::structure;
+
+ // Declaring this to avoid static casts to std::ptrdiff_t in std::min and std::max calls
+ const std::ptrdiff_t signed_zero = 0;
+
+ static_assert(
+ band_index < std::tuple_size< typename Structure::band_intervals >::value,
+ "Provided band index is out of bounds."
+ );
+
+ assert( row < nrows( A ) );
+
+ // cast matrix dimensions to signed integer to allow for comparison with negative numbers
+ const std::ptrdiff_t N = static_cast< std::ptrdiff_t >( ncols( A ) );
+
+ constexpr bool is_sym = structures::is_a< Structure, structures::Symmetric >::value;
+ // Temporary until adding multiple symmetry directions
+ constexpr bool sym_up = is_sym;
+
+ // Band limits
+ const std::ptrdiff_t l = structures::get_lower_limit< band_index >( A );
+ const std::ptrdiff_t u = structures::get_upper_limit< band_index >( A );
+
+ // Band limits taking into account symmetry
+ const std::ptrdiff_t sym_l = is_sym && sym_up ? std::max( signed_zero, l ) : l;
+ const std::ptrdiff_t sym_u = is_sym && !sym_up ? std::min( signed_zero, u ) : u;
+
+ // column coordinate lower and upper limits considering the provided row coordinate
+ const std::ptrdiff_t sym_l_row = static_cast< std::ptrdiff_t >( row ) + sym_l;
+ const std::ptrdiff_t sym_u_row = sym_l_row + ( sym_u - sym_l );
+
+ // fit the limits within the matrix dimensions
+ const size_t lower_limit = static_cast< size_t >( std::max( std::min( sym_l_row, N ), signed_zero ) );
+ const size_t upper_limit = static_cast< size_t >( std::max( std::min( sym_u_row, N ), signed_zero ) );
+
+ assert( lower_limit <= upper_limit );
+
+ return std::make_pair( lower_limit, upper_limit );
+ }
+
+ } // namespace structures
+
+ namespace structures {
+ namespace constant {
+
+			/** Returns (by value) a functor-backed Identity matrix of the provided size */
+			template< typename T, Backend backend >
+			const Matrix<
+				T, structures::Identity, Density::Dense,
+				view::Functor< std::function< const T( const size_t, const size_t ) > >,
+				imf::Id, imf::Id, backend
+			>
+			I( const size_t n ) {
+
+				return Matrix<
+					T, structures::Identity, Density::Dense,
+					view::Functor< std::function< const T( const size_t, const size_t ) > >,
+					imf::Id, imf::Id, backend
+				>(
+					[]( const size_t i, const size_t j ) {
+						return ( i == j ) ? 1 : 0;
+					},
+					n,
+					n
+				);
+			}
+
+			/** Returns (by value) a functor-backed Zero matrix of the provided size */
+			template< typename T, Backend backend >
+			const Matrix<
+				T, structures::Zero, Density::Dense,
+				view::Functor< std::function< const T( const size_t, const size_t ) > >,
+				imf::Id, imf::Id, backend
+			>
+			Zero( const size_t rows, const size_t cols ) {
+				return Matrix<
+					T, structures::Zero, Density::Dense,
+					view::Functor< std::function< const T( const size_t, const size_t ) > >,
+					imf::Id, imf::Id, backend
+				> (
+					[]( const size_t, const size_t ) {
+						return 0;
+					},
+					rows,
+					cols
+				);
+			}
+
+ namespace internal {
+
+				/** Returns a constant reference to a matrix representing Givens rotation
+				 * of the provided size n and parameters i, j, s and c, where
+				 * s = sin( theta ) and c = cos( theta ). \warning The returned matrix is heap-allocated and never freed (leaks).
+				 */
+				template< typename T, Backend backend >
+				const Matrix<
+					T, structures::Square, Density::Dense,
+					view::Original< void >, imf::Id, imf::Id, backend
+				> &
+				Givens( const size_t n, const size_t i, const size_t j, const T s, const T c ) {
+					using return_type = const Matrix<
+						T, structures::Square, Density::Dense, view::Original< void >, imf::Id, imf::Id, backend
+					>;
+					return_type * ret = new return_type( n );
+					// TODO: initialize matrix values according to the provided parameters, and manage ownership (the allocation above is currently leaked)
+					return * ret;
+				}
+
+ } // namespace internal
+ } // namespace constant
+ } // namespace structures
+} // namespace alp
+
+#endif // end ``_H_ALP_AMF_BASED_MATRIX''
diff --git a/include/alp/amf-based/storage.hpp b/include/alp/amf-based/storage.hpp
new file mode 100644
index 000000000..7d5561339
--- /dev/null
+++ b/include/alp/amf-based/storage.hpp
@@ -0,0 +1,1201 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *
+ * @file
+ *
+ * This file registers mechanisms for coordinate mapping between
+ * logical and physical iteration spaces.
+ *
+ */
+
+#ifndef _H_ALP_AMF_BASED_STORAGE
+#define _H_ALP_AMF_BASED_STORAGE
+
+#include
+#include
+
+#include
+#include
+#include
+
+
+namespace alp {
+
+ namespace internal {
+
+ /**
+ * Determines the mapping polynomial type and exposes a factory method
+ * to create instances of that polynomial.
+ *
+ * All specializations of this type trait should define the factory
+ * method following the same signature. The factory method shall
+ * return an object of the type exposed as \a type.
+ *
+ * @tparam Structure Matrix structure
+ * @tparam ImfR Row IMF type
+ * @tparam ImfC Column IMF type
+ * @tparam backend The backend
+ *
+ */
+ template< typename Structure, typename ImfR, typename ImfC, enum Backend backend >
+ struct determine_poly_factory {};
+
+ } // namespace internal
+
+ namespace storage {
+
+ enum StorageOrientation {
+ ROW_WISE,
+ COLUMN_WISE
+ };
+
+ enum StoredPart {
+ UPPER,
+ LOWER
+ };
+
+ /**
+ * The namespace containts polynomials used to map coordinates
+ * between logical and physical iteration spaces,
+ * associated type traits and helper classes.
+ */
+ namespace polynomials {
+
+ /**
+ * Implements the polynomial
+ * ( A*a*x^2 + B*b*y^2 + C*c*x*y + D*d*x + E*e*y + F*f ) / Denominator
+ * where uppercase coefficients are compile-time constant,
+ * lowercase coefficients are run-time constant,
+ * and x and y are variables.
+ * All coefficients and variables are integers and all operations are integer
+ * operations.
+ *
+ * The purpose of compile-time constant coefficients is to allow compile-time
+ * optimizations for zero terms/monomials.
+ *
+			 * Denominator allows for implementation of polynomials with integer division,
+ * e.g., n * ( n + 1 ) / 2,
+ * while avoiding the need for floating point coefficients and operations.
+ *
+ * @tparam Ax2 Static coefficient corresponding to x^2
+ * @tparam Ay2 Static coefficient corresponding to y^2
+ * @tparam Axy Static coefficient corresponding to x*y
+ * @tparam Ax Static coefficient corresponding to x
+ * @tparam Ay Static coefficient corresponding to y
+ * @tparam A0 Static coefficient corresponding to constant term
+ * @tparam Denominator Static denominator dividing the whole polynomial
+ */
+ template<
+ size_t coeffAx2, size_t coeffAy2, size_t coeffAxy,
+ size_t coeffAx, size_t coeffAy,
+ size_t coeffA0,
+ size_t Denominator
+ >
+ struct BivariateQuadratic {
+
+ static_assert( Denominator != 0, "Denominator cannot be zero (division by zero).");
+ typedef int64_t dyn_coef_t;
+
+ static constexpr size_t Ax2 = coeffAx2;
+ static constexpr size_t Ay2 = coeffAy2;
+ static constexpr size_t Axy = coeffAxy;
+ static constexpr size_t Ax = coeffAx;
+ static constexpr size_t Ay = coeffAy;
+ static constexpr size_t A0 = coeffA0;
+ static constexpr size_t D = Denominator;
+ const dyn_coef_t ax2, ay2, axy, ax, ay, a0;
+
+ BivariateQuadratic(
+ const dyn_coef_t ax2, const dyn_coef_t ay2, const dyn_coef_t axy,
+ const dyn_coef_t ax, const dyn_coef_t ay,
+ const dyn_coef_t a0 ) :
+ ax2( ax2 ), ay2( ay2 ), axy( axy ),
+ ax( ax ), ay( ay ),
+ a0( a0 ) {}
+
+ size_t evaluate( const size_t x, const size_t y ) const {
+ return (Ax2 * ax2 * x * x +
+ Ay2 * ay2 * y * y +
+ Axy * axy * x * y +
+ Ax * ax * x +
+ Ay * ay * y +
+ A0 * a0) / D;
+ }
+
+ }; // BivariateQuadratic
+
+ /** \internal Defines the interface implemented by other polynomial factories */
+ struct AbstractFactory {
+
+ /** \internal Defines the type of the polynomial returned by Create */
+ typedef BivariateQuadratic< 0, 0, 0, 0, 0, 0, 1 > poly_type;
+
+ /** \internal Instantiates a polynomial */
+ static poly_type Create( const size_t rows, const size_t cols );
+
+ /** \internal Returns the size of storage associated with the defined polynomial */
+ static size_t GetStorageDimensions( const size_t rows, const size_t cols );
+
+ }; // struct AbstractFactory
+
+ /** p(i,j) = 0 */
+ struct NoneFactory {
+
+ typedef BivariateQuadratic< 0, 0, 0, 0, 0, 0, 1 > poly_type;
+
+ static poly_type Create( const size_t rows, const size_t cols ) {
+ (void) rows;
+ (void) cols;
+ return poly_type( 0, 0, 0, 0, 0, 0 );
+ }
+
+ static size_t GetStorageDimensions( const size_t rows, const size_t cols ) {
+ (void) rows;
+ (void) cols;
+ return 0;
+ }
+ }; // struct NoneFactory
+
+ /** p(i,j) = Ni + j */
+ template< bool row_major = true >
+ struct FullFactory {
+
+ typedef BivariateQuadratic< 0, 0, 0, 1, 1, 0, 1 > poly_type;
+
+ static poly_type Create( const size_t rows, const size_t cols ) {
+ if( row_major ){
+ return poly_type( 0, 0, 0, cols, 1, 0 );
+ } else {
+ return poly_type( 0, 0, 0, 1, rows, 0 );
+ }
+ }
+
+ static size_t GetStorageDimensions( const size_t rows, const size_t cols ) {
+ return rows * cols;
+ }
+ }; // struct FullFactory
+
+ /** Implements packed, triangle-like storage */
+ template< enum StoredPart stored_part, enum StorageOrientation orientation >
+ struct PackedFactory;
+
+ /** p(i,j) = (-i^2 + (2N - 1)i + 2j) / 2 */
+ template<>
+ struct PackedFactory< UPPER, ROW_WISE > {
+
+ typedef BivariateQuadratic< 1, 0, 0, 1, 2, 0, 2 > poly_type;
+
+ static poly_type Create( const size_t rows, const size_t cols ) {
+#ifdef NDEBUG
+ (void) cols;
+ (void) rows;
+#endif
+ assert( rows == cols );
+ return poly_type( -1, 0, 0, 2 * cols - 1, 1, 0 );
+ }
+
+ static size_t GetStorageDimensions( const size_t rows, const size_t cols ) {
+#ifdef NDEBUG
+ (void) cols;
+#endif
+ assert( rows == cols );
+ return rows * ( rows + 1 ) / 2;
+ }
+ };
+
+ /** p(i,j) = (j^2 + 2i + j) / 2 */
+ template<>
+ struct PackedFactory< UPPER, COLUMN_WISE > {
+
+ typedef BivariateQuadratic< 0, 1, 0, 2, 1, 0, 2 > poly_type;
+
+ static poly_type Create( const size_t rows, const size_t cols ) {
+#ifdef NDEBUG
+ (void) cols;
+ (void) rows;
+#endif
+ assert( rows == cols );
+ return poly_type( 0, 1, 0, 1, 1, 0 );
+ }
+
+ static size_t GetStorageDimensions( const size_t rows, const size_t cols ) {
+#ifdef NDEBUG
+ (void) cols;
+#endif
+ assert( rows == cols );
+ return rows * ( rows + 1 ) / 2;
+ }
+ }; // struct PackedFactory
+
+ /** p(i,j) = (i^2 + i + 2j) / 2 */
+ template<>
+ struct PackedFactory< LOWER, ROW_WISE > {
+
+ typedef BivariateQuadratic< 1, 0, 0, 1, 2, 0, 2 > poly_type;
+
+ static poly_type Create( const size_t rows, const size_t cols ) {
+#ifdef NDEBUG
+ (void) cols;
+ (void) rows;
+#endif
+ assert( rows == cols );
+ return poly_type( 1, 0, 0, 1, 1, 0 );
+ }
+
+ static size_t GetStorageDimensions( const size_t rows, const size_t cols ) {
+#ifdef NDEBUG
+ (void) cols;
+#endif
+ assert( rows == cols );
+ return rows * ( rows + 1 ) / 2;
+ }
+ }; // struct PackedFactory
+
+ /** p(i,j) = (-j^2 + 2i + (2M - 1)j) / 2 */
+ template<>
+ struct PackedFactory< LOWER, COLUMN_WISE > {
+
+ typedef BivariateQuadratic< 0, 1, 0, 2, 1, 0, 2 > poly_type;
+
+ static poly_type Create( const size_t rows, const size_t cols ) {
+#ifdef NDEBUG
+ (void) rows;
+ (void) cols;
+#endif
+ assert( rows == cols );
+ return poly_type( 0, -1, 0, 1, 2 * rows - 1, 0 );
+ }
+
+ static size_t GetStorageDimensions( const size_t rows, const size_t cols ) {
+#ifdef NDEBUG
+ (void) cols;
+#endif
+ assert( rows == cols );
+ return rows * ( rows + 1 ) / 2;
+ }
+ };
+
+ template< size_t l, size_t u, bool row_wise >
+ struct BandFactory {
+
+ typedef BivariateQuadratic< 0, 0, 0, 0, 0, 0, 1 > poly_type;
+
+ static poly_type Create( const size_t rows, const size_t cols ) {
+ (void) rows;
+ (void) cols;
+ throw std::runtime_error( "Needs an implementation." );
+ }
+
+ static size_t GetStorageDimensions( const size_t rows, const size_t cols ) {
+ (void) rows;
+ (void) cols;
+ throw std::runtime_error( "Needs an implementation." );
+ }
+ }; // struct BandFactory
+
+ struct ArrayFactory {
+ /** p(i,j) = i */
+ typedef BivariateQuadratic< 0, 0, 0, 1, 0, 0, 1 > poly_type;
+
+ static poly_type Create( const size_t rows, const size_t cols ) {
+ (void) rows;
+ (void) cols;
+ return poly_type( 0, 0, 0, 1, 0, 0 );
+ }
+
+ static size_t GetStorageDimensions( const size_t rows, const size_t cols ) {
+ assert( ( rows == 1 ) || ( cols == 1 ) );
+ return rows * cols;
+ }
+ };
+
+ template< enum view::Views view, typename Polynomial >
+ struct apply_view {};
+
+ template< typename Polynomial >
+ struct apply_view< view::original, Polynomial > {
+ typedef Polynomial type;
+ };
+
+ template< typename Polynomial >
+ struct apply_view< view::transpose, Polynomial > {
+ typedef BivariateQuadratic< Polynomial::Ay2, Polynomial::Ax2, Polynomial::Axy, Polynomial::Ay, Polynomial::Ax, Polynomial::A0, Polynomial::D > type;
+ };
+
+ template< typename Polynomial >
+ struct apply_view< view::diagonal, Polynomial > {
+ typedef Polynomial type;
+ };
+
+ template< typename Polynomial >
+ struct apply_view< view::_internal, Polynomial > {
+ typedef typename NoneFactory::poly_type type;
+ };
+
+ /**
+ * Specifies the resulting IMF and Polynomial types after fusing
+ * the provided IMF and Polynomial and provides two factory methods
+ * to create the IMF and the Polynomial of the resulting types.
+ * In the general case, the fusion does not happen and the resulting
+ * types are equal to the provided types.
+ */
+ template< typename Imf, typename Poly >
+ struct fuse_on_i {
+
+ typedef Imf resulting_imf_type;
+ typedef Poly resulting_polynomial_type;
+
+ static resulting_imf_type CreateImf( Imf imf ) {
+ return imf;
+ }
+
+ static resulting_polynomial_type CreatePolynomial( Imf imf, Poly p ) {
+ (void) imf;
+ return p;
+ }
+ };
+
+ /**
+ * Specialization for Id IMF.
+ */
+ template< typename Poly >
+ struct fuse_on_i< imf::Id, Poly > {
+
+ /** The resulting IMF is an Id because strided IMF is fully fused into the polynomial */
+ typedef imf::Id resulting_imf_type;
+
+ /** Some static factors change after injecting strided IMF into the polynomial */
+ typedef Poly resulting_polynomial_type;
+
+ static resulting_imf_type CreateImf( imf::Id imf ) {
+ return imf::Id( imf.n );
+ }
+
+ static resulting_polynomial_type CreatePolynomial( imf::Id imf, Poly p ) {
+ (void)imf;
+ return p;
+ }
+ };
+
+ /**
+ * Specialization for strided IMF.
+ */
+ template< typename Poly >
+ struct fuse_on_i< imf::Strided, Poly> {
+
+ /** The resulting IMF is an Id because strided IMF is fully fused into the polynomial */
+ typedef imf::Id resulting_imf_type;
+
+ /** Some static factors change after injecting strided IMF into the polynomial */
+ typedef BivariateQuadratic<
+ Poly::Ax2, Poly::Ay2, Poly::Axy,
+ Poly::Ax2 || Poly::Ax, Poly::Axy || Poly::Ay,
+ Poly::Ax2 || Poly::Ax || Poly::A0,
+ Poly::D
+ > resulting_polynomial_type;
+
+ static resulting_imf_type CreateImf( imf::Strided imf ) {
+ return imf::Id( imf.n );
+ }
+
+ static resulting_polynomial_type CreatePolynomial( imf::Strided imf, Poly p ) {
+ return resulting_polynomial_type(
+ p.ax2 * imf.s * imf.s, // ax2
+ p.ay2, // ay2
+ p.axy * imf.s, // axy
+ 2 * Poly::Ax2 * p.ax2 * imf.s * imf.b + Poly::Ax * p.ax * imf.s, // ax
+ Poly::Ay * p.ay + Poly::Axy * p.axy * imf.b, // ay
+ Poly::Ax2 * p.ax2 * imf.b * imf.b + Poly::Ax * p.ax * imf.b + Poly::A0 * p.a0 // A0
+ );
+ }
+ };
+
+ /**
+ * Specialization for zero IMF.
+ */
+ template< typename Poly >
+ struct fuse_on_i< imf::Zero, Poly> {
+
+ /** The resulting IMF is an Id because strided IMF is fully fused into the polynomial */
+ typedef imf::Id resulting_imf_type;
+
+ /** Some static factors change after injecting strided IMF into the polynomial */
+ typedef BivariateQuadratic<
+ 0, Poly::Ay2, 0,
+ 0, Poly::Ay,
+ Poly::A0,
+ Poly::D
+ > resulting_polynomial_type;
+
+ static resulting_imf_type CreateImf( imf::Zero imf ) {
+ return imf::Id( imf.n );
+ }
+
+ static resulting_polynomial_type CreatePolynomial( imf::Zero imf, Poly p ) {
+ (void)imf;
+ return resulting_polynomial_type(
+ 0, // ax2
+ p.ay2, // ay2
+ 0, // axy
+ 0, // ax
+ p.ay, // ay
+ p.a0 // A0
+ );
+ }
+ };
+
+ template< typename Imf, typename Poly >
+ struct fuse_on_j {
+
+ typedef Imf resulting_imf_type;
+ typedef Poly resulting_polynomial_type;
+
+ static resulting_imf_type CreateImf( Imf imf ) {
+ return imf;
+ }
+
+ static resulting_polynomial_type CreatePolynomial( Imf imf, Poly p ) {
+ (void) imf;
+ return p;
+ }
+ };
+
+ /**
+ * Specialization for Id IMF.
+ */
+ template< typename Poly >
+ struct fuse_on_j< imf::Id, Poly > {
+
+ /** The resulting IMF is an Id because strided IMF is fully fused into the polynomial */
+ typedef imf::Id resulting_imf_type;
+
+ /** Some static factors change after injecting strided IMF into the polynomial */
+ typedef Poly resulting_polynomial_type;
+
+ static resulting_imf_type CreateImf( imf::Id imf ) {
+ return imf::Id( imf.n );
+ }
+
+ static resulting_polynomial_type CreatePolynomial( imf::Id imf, Poly p ) {
+ (void)imf;
+ return p;
+ }
+ };
+
+ /**
+ * Specialization for strided IMF.
+ */
+ template< typename Poly >
+ struct fuse_on_j< imf::Strided, Poly > {
+
+ /** The resulting IMF is an Id because strided IMF is fully fused into the polynomial */
+ typedef imf::Id resulting_imf_type;
+
+ /** Some static factors change after injecting strided IMF into the polynomial */
+ typedef BivariateQuadratic<
+ Poly::Ax2, Poly::Ay2, Poly::Axy,
+ Poly::Axy || Poly::Ax, Poly::Ay2 || Poly::Ay,
+ Poly::Ay2 || Poly::Ay || Poly::A0,
+ Poly::D
+ > resulting_polynomial_type;
+
+ static resulting_imf_type CreateImf( imf::Strided imf ) {
+ return imf::Id( imf.n );
+ }
+
+ static resulting_polynomial_type CreatePolynomial( imf::Strided imf, Poly p ) {
+ return resulting_polynomial_type(
+ p.ax2, // ax2
+ p.ay2 * imf.s * imf.s, // ay2
+ p.axy * imf.s, // axy
+ Poly::Ax * p.ax + Poly::Axy * p.axy * imf.b, // ax
+ 2 * Poly::Ay2 * p.ay2 * imf.s * imf.b + Poly::Ay * p.ay * imf.s, // ay
+ Poly::Ay2 * p.ay2 * imf.b * imf.b + Poly::Ay * p.ay * imf.b + Poly::A0 * p.a0 // A0
+ );
+ }
+ };
+
+ /**
+ * Specialization for constant-mapping IMF.
+ */
+ template< typename Poly >
+ struct fuse_on_j< imf::Constant, Poly > {
+
+ /** The resulting IMF is an Id because strided IMF is fully fused into the polynomial */
+ typedef imf::Id resulting_imf_type;
+
+ /** j factors contribute to the constant factor, while they become 0 */
+ typedef BivariateQuadratic<
+ Poly::Ax2, 0, 0,
+ Poly::Ax || Poly::Axy, 0,
+ Poly::A0 || Poly::Ay || Poly::Ay2,
+ Poly::D
+ > resulting_polynomial_type;
+
+ static resulting_imf_type CreateImf( imf::Constant imf ) {
+ return imf::Id( imf.n );
+ }
+
+ static resulting_polynomial_type CreatePolynomial( imf::Constant imf, Poly p ) {
+ return resulting_polynomial_type(
+ p.ax2, // ax2
+ 0, // ay2
+ 0, // axy
+ Poly::Ax * p.ax +
+ Poly::Axy * p.axy * imf.b, // ax
+ 0, // ay
+ Poly::A0 * p.a0 +
+ Poly::Ay * p.ay * imf.b +
+ Poly::Ay2 * p.ay2 * imf.b * imf.b // A0
+ );
+ }
+ };
+
+ /**
+ * Specialization for zero IMF.
+ */
+ template< typename Poly >
+ struct fuse_on_j< imf::Zero, Poly > {
+
+ /** The resulting IMF is an Id because strided IMF is fully fused into the polynomial */
+ typedef imf::Id resulting_imf_type;
+
+ /** Some static factors change after injecting strided IMF into the polynomial */
+ typedef BivariateQuadratic<
+ Poly::Ax2, 0, 0,
+ Poly::Ax, 0,
+ Poly::A0,
+ Poly::D
+ > resulting_polynomial_type;
+
+ static resulting_imf_type CreateImf( imf::Zero imf ) {
+ return imf::Id( imf.n );
+ }
+
+ static resulting_polynomial_type CreatePolynomial( imf::Zero imf, Poly p ) {
+ (void)imf;
+ return resulting_polynomial_type(
+ p.ax2, // ax2
+ 0, // ay2
+ 0, // axy
+ p.ax, // ax
+ 0, // ay
+ p.a0 // A0
+ );
+ }
+ };
+
+ }; // namespace polynomials
+
+ /** Forward declaration */
+ template< enum Backend backend >
+ class AMFFactory;
+
+ /**
+ * Access Mapping Function (AMF) maps logical matrix coordinates (i, j)
+ * to the corresponding matrix element's location in the physical container.
+ *
+ * To calculate the mapping, the AMF first applies logical-to-logical
+ * mapping provided by one IMF per coordinate (row and column).
+ * A bivariate polynomial (called mapping polynomial) takes these two
+ * output coordinates as inputs to calculate the position is physical
+ * storage of the requested element (logical-to-physical mapping).
+ *
+ * For certain combinations of IMFs and mapping polynomial types it is
+ * possible to fuse the index computation into a single function call.
+ * AMF specializations for such IMF and polynomial types are free to do
+ * any optimizations.
+ *
+ * All AMF specializations shall expose the effective types of the IMFs
+ * and the mapping polynomial, since these may change after the fusion.
+ */
+ template<
+ typename ImfR, typename ImfC, typename MappingPolynomial,
+ enum Backend backend
+ >
+ class AMF {
+
+ friend class AMFFactory< backend >;
+
+ public:
+
+ /** Expose static properties */
+ typedef ImfR imf_r_type;
+ typedef ImfC imf_c_type;
+ typedef MappingPolynomial mapping_polynomial_type;
+
+ private:
+
+ const imf_r_type imf_r;
+ const imf_c_type imf_c;
+ const mapping_polynomial_type map_poly;
+ const size_t storage_dimensions;
+
+ AMF( ImfR imf_r, ImfC imf_c, MappingPolynomial map_poly, const size_t storage_dimensions ) :
+ imf_r( imf_r ), imf_c( imf_c ), map_poly( map_poly ), storage_dimensions( storage_dimensions ) {}
+
+ AMF( const AMF & ) = delete;
+ AMF &operator=( const AMF & ) = delete;
+
+ public:
+
+ AMF( AMF &&amf ) :
+ imf_r( std::move( amf.imf_r ) ),
+ imf_c( std::move( amf.imf_c ) ),
+ map_poly( std::move( amf.map_poly ) ),
+ storage_dimensions( std::move( amf.storage_dimensions ) ) {}
+
+ /**
+ * Returns dimensions of the logical layout of the associated container.
+ *
+ * @return A pair of two values, number of rows and columns, respectively.
+ */
+ std::pair< size_t, size_t> getLogicalDimensions() const {
+ return std::make_pair( imf_r.n, imf_c.n );
+ }
+
+ /**
+ * Returns dimensions of the physical layout of the associated container.
+ *
+ * @return The size of the physical container.
+ */
+ std::size_t getStorageDimensions() const {
+ return storage_dimensions;
+ }
+
+ /**
+ * @brief Returns a storage index based on the coordinates in the
+ * logical iteration space.
+ *
+ * @tparam R ImfR type
+ * @tparam C ImfC type
+ *
+ * @param[in] i row-coordinate
+ * @param[in] j column-coordinate
+ * @param[in] s current process ID
+ * @param[in] P total number of processes
+ *
+ * @return storage index corresponding to the provided logical
+ * coordinates and parameters s and P.
+ *
+ * \note It is not necessary to call imf.map() function if the imf
+ * has the type imf::Id. To implement SFINAE-driven selection
+ * of the getStorageIndex, dummy parameters R and C are added.
+ * They are set to the ImfR and ImfC by default and a static
+ * assert ensures that external caller does not force a call
+ * to wrong implementation by explicitly specifying values
+ * for R and/or C.
+ *
+ */
+ template<
+ typename R = ImfR, typename C = ImfC,
+ std::enable_if_t< !std::is_same< R, imf::Id >::value && !std::is_same< C, imf::Id >::value > * = nullptr
+ >
+ size_t getStorageIndex( const size_t i, const size_t j, const size_t s, const size_t P ) const {
+ static_assert(
+ std::is_same< R, ImfR >::value && std::is_same< C, ImfC >::value,
+ "Explicit specialization of getStorageIndex is not allowed."
+ );
+ (void)s;
+ (void)P;
+ return map_poly.evaluate( imf_r.map( i ), imf_c.map( j ) );
+ }
+
+ template<
+ typename R = ImfR, typename C = ImfC,
+ std::enable_if_t< std::is_same< R, imf::Id >::value && !std::is_same< C, imf::Id >::value > * = nullptr
+ >
+ size_t getStorageIndex( const size_t i, const size_t j, const size_t s, const size_t P ) const {
+ static_assert(
+ std::is_same< R, ImfR >::value && std::is_same< C, ImfC >::value,
+ "Explicit specialization of getStorageIndex is not allowed."
+ );
+ (void)s;
+ (void)P;
+ return map_poly.evaluate( i, imf_c.map( j ) );
+ }
+
+ template<
+ typename R = ImfR, typename C = ImfC,
+ std::enable_if_t< !std::is_same< R, imf::Id >::value && std::is_same< C, imf::Id >::value > * = nullptr
+ >
+ size_t getStorageIndex( const size_t i, const size_t j, const size_t s, const size_t P ) const {
+ static_assert(
+ std::is_same< R, ImfR >::value && std::is_same< C, ImfC >::value,
+ "Explicit specialization of getStorageIndex is not allowed."
+ );
+ (void)s;
+ (void)P;
+ return map_poly.evaluate( imf_r.map( i ), j );
+ }
+
+ template<
+ typename R = ImfR, typename C = ImfC,
+ std::enable_if_t< std::is_same< R, imf::Id >::value && std::is_same< C, imf::Id >::value > * = nullptr
+ >
+ size_t getStorageIndex( const size_t i, const size_t j, const size_t s, const size_t P ) const {
+ static_assert(
+ std::is_same< R, ImfR >::value && std::is_same< C, ImfC >::value,
+ "Explicit specialization of getStorageIndex is not allowed."
+ );
+ (void)s;
+ (void)P;
+ return map_poly.evaluate( i, j );
+ }
+
+ /**
+ * Returns coordinates in the logical iteration space based on
+ * the storage index.
+ *
+ * @param[in] storageIndex storage index in the physical
+ * iteration space
+ * @param[in] s current process ID
+ * @param[in] P total number of processes
+ *
+ * @return a pair of row- and column-coordinates in the
+ * logical iteration space.
+ */
+ std::pair< size_t, size_t > getCoords( const size_t storageIndex, const size_t s, const size_t P ) const;
+
+ }; // class AMF
+
+ /**
+ * Collects AMF factory classes.
+ */
+ template< enum Backend backend >
+ struct AMFFactory {
+
+ /**
+ * @brief Transforms the provided AMF by applying the gather view
+ * represented by the given row and column IMFs
+ *
+ * Exposes the type of the resulting AMF and implements a factory
+ * method that creates objects of such type.
+			 * The IMFs and the AMF may be fused (simplified), depending on
+ * the properties of the IMFs. For example, static IMFs (e.g. Id,
+ * Strided) are easily fused into the mapping polynomial.
+ *
+ * Fusion of the IMFs into the mapping polynomial results in a
+ * reduced amount of function calls and, potentially, less computation,
+ * on each call to the map function of the AMF. This is especially
+ * beneficial for longer chains of views.
+ *
+ * Creation of the new AMF is done in following order:
+ * - view row IMF and target row IMF are composed
+ * - view col IMF and target col IMF are composed
+ * - composed row IMF is fused into the target Poly, if possible,
+ * yielding the new intermediate polynomial
+ * - composed col IMF is fused, if possible, into the intermediate
+ * polynomial, obtained above. This results in the final fused
+ * polynomial.
+ *
+ * @tparam view The enum value of the desired view type.
+ * @tparam SourceAMF The type of the target AMF
+ *
+ */
+ template< typename ViewImfR, typename ViewImfC, typename SourceAMF >
+ struct Compose {
+
+ private:
+
+ /** Extract target IMF and polynomial types */
+ typedef typename SourceAMF::imf_r_type SourceImfR;
+ typedef typename SourceAMF::imf_c_type SourceImfC;
+ typedef typename SourceAMF::mapping_polynomial_type SourcePoly;
+
+ /** Compose row and column IMFs */
+ typedef typename imf::ComposedFactory< SourceImfR, ViewImfR >::type composed_imf_r_type;
+ typedef typename imf::ComposedFactory< SourceImfC, ViewImfC >::type composed_imf_c_type;
+
+ /** Fuse composed row IMF into the target polynomial */
+ typedef typename polynomials::fuse_on_i<
+ composed_imf_r_type,
+ SourcePoly
+ > fused_row;
+
+ /** Fuse composed column IMF into the intermediate polynomial obtained above */
+ typedef typename polynomials::fuse_on_j<
+ composed_imf_c_type,
+ typename fused_row::resulting_polynomial_type
+ > fused_row_col;
+
+ typedef typename fused_row::resulting_imf_type final_imf_r_type;
+ typedef typename fused_row_col::resulting_imf_type final_imf_c_type;
+ typedef typename fused_row_col::resulting_polynomial_type final_polynomial_type;
+
+ public:
+
+ typedef AMF< final_imf_r_type, final_imf_c_type, final_polynomial_type, backend > amf_type;
+
+ static amf_type Create( ViewImfR imf_r, ViewImfC imf_c, const AMF< SourceImfR, SourceImfC, SourcePoly, backend > &amf ) {
+ composed_imf_r_type composed_imf_r = imf::ComposedFactory< SourceImfR, ViewImfR >::create( amf.imf_r, imf_r );
+ composed_imf_c_type composed_imf_c = imf::ComposedFactory< SourceImfC, ViewImfC >::create( amf.imf_c, imf_c );
+ return amf_type(
+ fused_row::CreateImf( composed_imf_r ),
+ fused_row_col::CreateImf( composed_imf_c ),
+ fused_row_col::CreatePolynomial(
+ composed_imf_c,
+ fused_row::CreatePolynomial( composed_imf_r, amf.map_poly )
+ ),
+ amf.storage_dimensions
+ );
+ }
+
+ Compose() = delete;
+
+ }; // class Compose
+
+ /**
+ * @brief Describes an AMF for a container that requires allocation
+ * and exposes the AMFs type and a factory method to create it.
+ *
+ * A container that requires allocation is accompanied by Id IMFs for
+ * both row and column dimensions and the provided mapping polynomial.
+ *
+ * @tparam PolyType Type of the mapping polynomial.
+ *
+ */
+ template< typename Structure, typename ImfR, typename ImfC >
+ struct FromPolynomial {
+
+ // Ensure compatibility of IMF types.
+ // Original Matrix has imf::Id as both IMFs.
+ // Original Vector has ImfR = imf::Id and ImfC = imf::Zero.
+ static_assert(
+ std::is_same< ImfR, imf::Id >::value &&
+ ( std::is_same< ImfC, imf::Id >::value || std::is_same< ImfC, imf::Zero >::value ),
+ "AMF factory FromPolynomial can only be used for an original container."
+ );
+
+ typedef typename internal::determine_poly_factory< Structure, ImfR, ImfC, backend >::factory_type PolyFactory;
+
+ typedef AMF< imf::Id, imf::Id, typename PolyFactory::poly_type, backend > amf_type;
+
+ /**
+ * Factory method used by 2D containers.
+ *
+			 * @param[in] imf_r Row IMF
+			 * @param[in] imf_c Column IMF
+			 *
+			 * The mapping polynomial and storage size are derived via PolyFactory.
+ *
+ * @return An AMF object of the type \a amf_type
+ *
+ */
+ static amf_type Create( imf::Id imf_r, imf::Id imf_c ) {
+ return amf_type( imf_r, imf_c, PolyFactory::Create( imf_r.n, imf_c.n ), PolyFactory::GetStorageDimensions( imf_r.n, imf_c.n ) );
+ }
+
+ /**
+ * Factory method used by 1D containers.
+ *
+ * Exploits the fact that fusion of strided IMFs into the polynomial
+ * always succeeds and results in Id IMFs. As a result, the
+ * constructed AMF is of the type \a amf_type.
+ *
+			 * @param[in] imf_r Row IMF
+			 * @param[in] imf_c Column IMF
+			 *
+			 * The mapping polynomial and storage size are derived via PolyFactory.
+ *
+ * @return An AMF object of the type \a amf_type
+ *
+ * \note \internal To exploit existing mechanism for IMF fusion
+ * into the polynomial, this method creates a
+ * dummy AMF out of two Id IMFs and the provided
+ * polynomial and composes the provided Strided
+ * IMFs with the dummy AMF.
+ */
+ static amf_type Create( imf::Id imf_r, imf::Zero imf_c ) {
+
+ /**
+ * Ensure that the assumptions do not break upon potential
+ * future changes to AMFFactory::Compose.
+ */
+ static_assert(
+ std::is_same<
+ amf_type,
+ typename Compose< imf::Id, imf::Zero, AMF< imf::Id, imf::Id, typename PolyFactory::poly_type, backend > >::amf_type
+ >::value,
+ "The factory method returns the object of different type than declared. This is a bug."
+ );
+ return Compose< imf::Id, imf::Zero, AMF< imf::Id, imf::Id, typename PolyFactory::poly_type, backend > >::Create(
+ imf_r, imf_c,
+ FromPolynomial< Structure, imf::Id, imf::Zero >::Create( imf::Id( imf_r.N ), imf::Id( imf_c.N ) )
+ );
+ }
+
+ FromPolynomial() = delete;
+
+ }; // class FromPolynomial
+
+ /**
+ * @brief Transforms the provided AMF by applying the provided View type.
+ *
+ * Exposes the type of the resulting AMF and implements a factory
+ * method that creates objects of such type.
+ *
+ * @tparam view The enum value of the desired view type.
+ * @tparam SourceAMF The type of the target AMF
+ *
+ */
+ template< enum view::Views view, typename SourceAMF >
+ struct Reshape {
+
+ typedef SourceAMF amf_type;
+
+ static amf_type Create( const SourceAMF &amf ) {
+ throw std::invalid_argument( "Not implemented for the provided view type." );
+ return amf;
+ }
+
+ Reshape() = delete;
+
+ }; // class Reshape
+
+ template< typename SourceAMF >
+ struct Reshape< view::original, SourceAMF > {
+
+ typedef SourceAMF amf_type;
+
+ static amf_type Create( const SourceAMF &amf ) {
+ return amf_type( amf.imf_r, amf.imf_c, amf.map_poly, amf.storage_dimensions );
+ }
+
+ Reshape() = delete;
+
+ }; // class Reshape< original, ... >
+
+ template< typename SourceAMF >
+ struct Reshape< view::transpose, SourceAMF > {
+
+ typedef AMF<
+ typename SourceAMF::imf_c_type,
+ typename SourceAMF::imf_r_type,
+ typename polynomials::apply_view<
+ view::transpose,
+ typename SourceAMF::mapping_polynomial_type
+ >::type,
+ backend
+ > amf_type;
+
+ static amf_type Create( const SourceAMF &amf ) {
+ typedef typename polynomials::apply_view< view::transpose, typename SourceAMF::mapping_polynomial_type >::type new_mapping_polynomial_type;
+ return AMF<
+ typename SourceAMF::imf_c_type,
+ typename SourceAMF::imf_r_type,
+ new_mapping_polynomial_type,
+ backend
+ >(
+ amf.imf_c,
+ amf.imf_r,
+ new_mapping_polynomial_type(
+ amf.map_poly.ay2, amf.map_poly.ax2, amf.map_poly.axy,
+ amf.map_poly.ay, amf.map_poly.ax,
+ amf.map_poly.a0
+ ),
+ amf.storage_dimensions
+ );
+ }
+
+ Reshape() = delete;
+
+ }; // class Reshape< transpose, ... >
+
+ /**
+ * Specialization for diagonal views
+ *
+ * Diagonal view is implemented by taking a square view over the matrix.
+ *
+ * \note \internal Converts a mapping polynomial from a bivariate-quadratic
+ * to univariate quadratic by summing j-factors into
+ * corresponding i-factors.
+		 *                  Implicitly applies a largest possible square view by
+ * using Strided IMFs.
+ *
+ */
+ template< typename SourceAMF >
+ struct Reshape< view::diagonal, SourceAMF > {
+
+ private:
+
+ /** Short name of the original mapping polynomial type */
+ typedef typename SourceAMF::mapping_polynomial_type orig_p;
+
+ /** The type of the resulting polynomial */
+ typedef polynomials::BivariateQuadratic<
+ orig_p::Ax2 || orig_p::Ay2 || orig_p::Axy, 0, 0,
+ orig_p::Ax || orig_p::Ay, 0,
+ orig_p::A0, orig_p::D
+ > new_poly_type;
+
+ public:
+
+ typedef AMF< imf::Id, imf::Zero, new_poly_type, backend > amf_type;
+
+ static amf_type Create( const SourceAMF &amf ) {
+ assert( amf.getLogicalDimensions().first == amf.getLogicalDimensions().second );
+ return amf_type(
+ imf::Id( amf.getLogicalDimensions().first ),
+ imf::Zero( 1 ),
+ new_poly_type(
+ orig_p::Ax2 * amf.map_poly.ax2 + orig_p::Ay2 * amf.map_poly.ay2 + orig_p::Axy * amf.map_poly.axy, 0, 0,
+ orig_p::Ax * amf.map_poly.ax + orig_p::Ay * amf.map_poly.ay, 0,
+ amf.map_poly.a0
+ ),
+ amf.storage_dimensions
+ );
+ }
+
+ Reshape() = delete;
+
+ }; // class Reshape< diagonal, ... >
+
+ /**
+ * Specialization for matrix views over vectors
+ *
+ * \note \internal The resulting AMF is equivalent to applying
+ * a composition with two ID IMFs.
+ *
+ */
+ template< typename SourceAMF >
+ struct Reshape< view::matrix, SourceAMF > {
+
+ typedef typename Compose< imf::Id, imf::Id, SourceAMF >::amf_type amf_type;
+
+ static amf_type Create( const SourceAMF &amf ) {
+ return Compose< imf::Id, imf::Id, SourceAMF >::Create(
+ imf::Id( amf.getLogicalDimensions().first ),
+ imf::Id( amf.getLogicalDimensions().second ),
+ amf
+ );
+ }
+
+ Reshape() = delete;
+
+			}; // class Reshape< matrix, ... >
+
+ }; // class AMFFactory
+
+ }; // namespace storage
+
+ namespace internal {
+
+ /**
+ * Determines the AMF type for a matrix having the provided static properties.
+ *
+ * For a matrix that requires allocation, the new AMF consists of two Id IMFs
+ * and the pre-defined mapping polynomial.
+ * For a view over another matrix, the new AMF is created from the AMF of the
+ * target matrix in one of the following ways:
+ * - When applying gather view using IMFs, the IMFs are applied to the AMF of
+ * the target matrix.
+ * - When applying a different view type (e.g. transpose or diagonal), the AMF
+ * of the target matrix is transformed according to the provided view type.
+ *
+ * @tparam View View type
+ * @tparam ImfR Row IMF type
+ * @tparam ImfC Column IMF type
+ * @tparam backend The backend
+ *
+ * The valid combinations of the input parameters are as follows:
+ * - original view on void with Id IMFs.
+ * - original view on ALP matrix with any type of IMFs
+ * - other type of views (e.g. transposed, diagonal) with only Id IMFs.
+ * Invocation using incompatible parameters may result in an undefined behavior.
+ * The first parameter combination is handled by a specialization of this trait.
+ *
+ */
+ template<
+ typename Structure, typename View, typename ImfR, typename ImfC,
+ enum Backend backend
+ >
+ struct determine_amf_type {
+
+ /** Ensure that the view is not on a void type */
+ static_assert(
+ !std::is_same< typename View::applied_to, void >::value,
+ "Cannot handle views over void type by this determine_amf_type specialization."
+ );
+
+ /** Ensure that if the view is original, the IMFs are Id */
+ static_assert(
+ View::type_id != view::original ||
+ ( View::type_id == view::original && std::is_same< imf::Id, ImfR >::value && std::is_same< imf::Id, ImfC >::value ),
+ "Original view with non-ID Index Mapping Functions is not supported."
+ );
+
+ /** Ensure that if the view is transposed, the IMFs are Id */
+ static_assert(
+ View::type_id != view::transpose ||
+ ( View::type_id == view::transpose && std::is_same< imf::Id, ImfR >::value && std::is_same< imf::Id, ImfC >::value ),
+ "Transposed view with non-ID Index Mapping Functions is not supported."
+ );
+
+ /** Ensure that if the view is diagonal, the row and column IMFs are Id and Zero, respectively */
+ static_assert(
+ View::type_id != view::diagonal ||
+ ( View::type_id == view::diagonal && std::is_same< imf::Id, ImfR >::value && std::is_same< imf::Zero, ImfC >::value ),
+ "Diagonal view with non-Id Row and non-Zero Column Index Mapping Functions is not supported."
+ );
+
+ typedef typename std::conditional<
+ View::type_id == view::gather,
+ typename storage::AMFFactory< backend >::template Compose<
+ ImfR, ImfC, typename View::applied_to::amf_type
+ >::amf_type,
+ typename storage::AMFFactory< backend >::template Reshape<
+ View::type_id,
+ typename View::applied_to::amf_type
+ >::amf_type
+ >::type type;
+
+ };
+
+ /** Specialization for storage-based containers that allocate storage */
+ template< typename Structure, typename ImfC, enum Backend backend >
+ struct determine_amf_type< Structure, view::Original< void >, imf::Id, ImfC, backend > {
+
+ static_assert(
+ std::is_same< ImfC, imf::Id >::value || std::is_same< ImfC, imf::Zero >::value,
+ "Incompatible combination of parameters provided to determine_amf_type."
+ );
+
+ typedef typename storage::AMFFactory< backend >::template FromPolynomial<
+ Structure, imf::Id, ImfC
+ >::amf_type type;
+ };
+
+ /** Specialization for functor-based containers that allocate storage */
+ template< typename Structure, typename ImfC, enum Backend backend, typename Lambda >
+ struct determine_amf_type< Structure, view::Functor< Lambda >, imf::Id, ImfC, backend > {
+
+ static_assert(
+ std::is_same< ImfC, imf::Id >::value || std::is_same< ImfC, imf::Zero >::value,
+ "Incompatible combination of parameters provided to determine_amf_type."
+ );
+
+ // A functor-based container does not have an AMF
+ typedef void type;
+ };
+
+ } // namespace internal
+
+} // namespace alp
+
+#endif // _H_ALP_AMF_BASED_STORAGE
diff --git a/include/alp/amf-based/storagebasedmatrix.hpp b/include/alp/amf-based/storagebasedmatrix.hpp
new file mode 100644
index 000000000..ec2c308d2
--- /dev/null
+++ b/include/alp/amf-based/storagebasedmatrix.hpp
@@ -0,0 +1,314 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _H_ALP_AMF_BASED_STORAGEBASEDMATRIX
+#define _H_ALP_AMF_BASED_STORAGEBASEDMATRIX
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "storage.hpp"
+
+/**
+ * @todo This should not happen given the hierarchy of concepts. Fix by splitting
+ * \a internal::Vector from the various vector.hpp
+ */
+
+#if defined( _ALP_WITH_REFERENCE ) || defined( _ALP_OMP_WITH_REFERENCE )
+ #include
+#endif
+#if defined( _ALP_WITH_DISPATCH ) || defined( _ALP_OMP_WITH_DISPATCH )
+ #include
+#endif
+
+
+namespace alp {
+
+ namespace internal {
+
+ /** Forward declaration */
+ template< typename DerivedMatrix >
+ class MatrixBase;
+
+ template< typename DerivedMatrix >
+ std::pair< size_t, size_t > dims( const MatrixBase< DerivedMatrix > &A ) noexcept;
+
+ template<
+ typename MatrixType,
+ std::enable_if_t< internal::is_storage_based< MatrixType >::value > * = nullptr
+ >
+ size_t getStorageDimensions( const MatrixType &A ) noexcept;
+
+ template< typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType>::value > * = nullptr
+ >
+ bool getInitialized( const MatrixType &A ) noexcept;
+
+ template< typename MatrixType,
+ std::enable_if_t< is_matrix< MatrixType>::value > * = nullptr
+ >
+ void setInitialized( MatrixType &, const bool ) noexcept;
+
+ /** Forward declaration */
+ template< typename T, typename AmfType, bool requires_allocation, Backend backend >
+ class StorageBasedMatrix;
+
+ /** Container reference getters used by friend functions of specialized Matrix */
+ template< typename T, typename AmfType, bool requires_allocation, Backend backend >
+ const Vector< T, backend > & getContainer( const StorageBasedMatrix< T, AmfType, requires_allocation, backend > & A );
+
+ template< typename T, typename AmfType, bool requires_allocation, Backend backend >
+ Vector< T, backend > & getContainer( StorageBasedMatrix< T, AmfType, requires_allocation, backend > & A );
+
+ /** Container reference getters. Defer the call to base class friend function */
+ template<
+ typename T, typename Structure, enum Density density, typename View,
+ typename ImfR, typename ImfC,
+ Backend backend
+ >
+ const Vector< T, backend > & getContainer( const alp::Matrix< T, Structure, density, View, ImfR, ImfC, backend > & A ) {
+ return getContainer( static_cast<
+ const StorageBasedMatrix<
+ T,
+ typename alp::Matrix< T, Structure, density, View, ImfR, ImfC, backend >::amf_type,
+ alp::Matrix< T, Structure, density, View, ImfR, ImfC, backend >::requires_allocation,
+ backend
+ > &
+ >( A ) );
+ }
+
+ template<
+ typename T, typename Structure, enum Density density, typename View,
+ typename ImfR, typename ImfC,
+ Backend backend
+ >
+ Vector< T, backend > & getContainer( alp::Matrix< T, Structure, density, View, ImfR, ImfC, backend > & A ) {
+ return getContainer( static_cast<
+ StorageBasedMatrix<
+ T,
+ typename alp::Matrix< T, Structure, density, View, ImfR, ImfC, backend >::amf_type,
+ alp::Matrix< T, Structure, density, View, ImfR, ImfC, backend >::requires_allocation,
+ backend
+ > &
+ >( A ) );
+ }
+
+		/** Returns the reference to the AMF of a storage-based matrix */
+		template<
+			typename MatrixType,
+			std::enable_if_t< internal::is_storage_based< MatrixType >::value > * = nullptr
+		>
+		const typename MatrixType::amf_type &getAmf( const MatrixType &A ) noexcept;
+
+ /**
+ * Matrix container specialization
+ * Implements both original containers and views on containers.
+ * @tparam requires_allocation True if the class is an original container
+ * False if the class is a view of another matrix
+ */
+ template< typename T, typename AmfType, bool requires_allocation, Backend backend >
+ class StorageBasedMatrix : public MatrixBase< StorageBasedMatrix< T, AmfType, requires_allocation, backend > > {
+
+ template<
+ typename MatrixType,
+ std::enable_if_t< internal::is_storage_based< MatrixType >::value > *
+ >
+ friend size_t getStorageDimensions( const MatrixType &A ) noexcept;
+
+			/** Get the reference to the AMF of a storage-based matrix */
+			template<
+				typename MatrixType,
+				std::enable_if_t< internal::is_storage_based< MatrixType >::value > *
+			>
+			friend const typename MatrixType::amf_type &getAmf( const MatrixType &A ) noexcept;
+
+ public:
+
+ /** Expose static properties */
+
+ typedef T value_type;
+ typedef AmfType amf_type;
+ typedef typename AmfType::imf_r_type imf_r_type;
+ typedef typename AmfType::imf_c_type imf_c_type;
+ /** Type returned by access function */
+ typedef T &access_type;
+ typedef const T &const_access_type;
+ /** Type of the index used to access the physical storage */
+ typedef size_t storage_index_type;
+
+ protected:
+ typedef StorageBasedMatrix< T, AmfType, requires_allocation, backend > self_type;
+ friend MatrixBase< self_type >;
+
+ typedef typename std::conditional<
+ requires_allocation,
+ Vector< T, backend >,
+ Vector< T, backend > &
+ >::type container_type;
+
+ /** A container-type view is characterized by its association with a physical container */
+ container_type container;
+
+ /**
+ * All matrix views use a pair of index mapping functions to
+ * capture the correspondence between their logical layout and the one
+ * of their underlying container. This may be another view leading to a composition
+ * of IMFs between the top matrix view and the physical container.
+ * Original matrix containers's index mapping functions are an identity mapping.
+ */
+ //ImfR imf_r;
+ //ImfC imf_c;
+
+ /**
+			 * The container's storage scheme. \a storage_scheme is not exposed to the user as an option
+			 * but can be defined by ALP at different points in the execution depending on the \a backend choice.
+			 * In particular, if the structured matrix is not a temporary matrix then it is fixed at construction
+			 * time when the allocation takes place.
+			 * If the structured matrix is a temporary one then a storage scheme choice may or may not be
+			 * made depending on whether a decision about instantiating the matrix is made by the framework.
+ *
+ * The specific storage scheme choice depends on the chosen backend and the structure of the matrix.
+ * \internal \todo Revisit this when introducing storage mapping functions.
+ */
+ //Smf smf;
+
+ /**
+ * Access mapping function maps a pair of logical coordinates
+ * into the concrete coordinate inside the actual container.
+ * \see AMF
+ */
+ AmfType amf;
+ /**
+ * @brief determines the size of the matrix via the domain of
+ * the index mapping functions.
+ *
+ * @return A pair of dimensions.
+ */
+ std::pair< size_t, size_t > dims() const noexcept {
+ return amf.getLogicalDimensions();
+ }
+
+ size_t getStorageDimensions() const noexcept {
+ return amf.getStorageDimensions();
+ }
+
+ friend const Vector< T, backend > & getContainer( const self_type &A ) {
+ return A.container;
+ }
+
+ friend Vector< T, backend > & getContainer( self_type &A ) {
+ return A.container;
+ }
+
+ bool getInitialized() const noexcept {
+ return internal::getInitialized( container );
+ }
+
+ void setInitialized( const bool initialized ) noexcept {
+ internal::setInitialized( container , initialized );
+ }
+
+ const AmfType &getAmf() const noexcept {
+ return amf;
+ }
+
+ /**
+ * Returns a constant reference to the element corresponding to
+ * the provided storage index.
+ *
+ * @param storageIndex storage index in the physical iteration
+ * space.
+ *
+ * @return const reference or value of the element at given position.
+ */
+ const_access_type access( const storage_index_type &storageIndex ) const {
+ return container[ storageIndex ];
+ }
+
+ access_type access( const storage_index_type &storageIndex ) {
+ return container[ storageIndex ];
+ }
+
+ storage_index_type getStorageIndex( const size_t i, const size_t j, const size_t s, const size_t P ) const {
+ return amf.getStorageIndex( i, j, s, P );
+ }
+
+ /**
+ * @brief Construct a new structured matrix Base object assigning identity
+ * mapping functions both to the row and column dimensions.
+ *
+ * @param rows The number of rows of the matrix.
+ * @param cols The number of columns of the matrix.
+ * @param smf The storage mapping function assigned to this matrix.
+ */
+			/** (Documentation of old MatrixContainer. TODO: merge with the above docs.)
+ * @brief Construct a new structured matrix container object.
+ *
+ * \warning \a cap is present for compatibility with other matrix specializations.
+ * In reference backend, the number of non-zeros (i.e. capacity)
+ * depends on the used storage scheme. Therefore, this parameter is
+ * ignored.
+ *
+ * TODO: Add the storage scheme a parameter to the constructor
+ * so that allocation can be made accordingly, generalizing the full case.
+ */
+ StorageBasedMatrix( AmfType &&amf ) :
+ // enable only if ImfR and ImfC are imf::Id
+ container( internal::Vector< T, backend >( amf.getStorageDimensions() ) ),
+ amf( std::move( amf ) ) {}
+
+ /** View on another container */
+ StorageBasedMatrix( Vector< T, backend > &container, AmfType &&amf ) :
+ container( container ),
+ amf( std::move( amf ) ) {}
+
+ /** View on another raw container */
+ StorageBasedMatrix( T *buffer, const size_t buffer_size, AmfType &&amf ) :
+ container( buffer, buffer_size ),
+ amf( std::move( amf ) ) {}
+
+ }; // class StorageBasedMatrix
+
+
+		/** Get the reference to the AMF of a storage-based matrix */
+		template<
+			typename MatrixType,
+			std::enable_if_t< internal::is_storage_based< MatrixType >::value > *
+		>
+		const typename MatrixType::amf_type &getAmf( const MatrixType &A ) noexcept {
+			return A.getAmf();
+		}
+
+ } // namespace internal
+
+	template<
+		typename MatrixType,
+		std::enable_if_t< internal::is_storage_based< MatrixType >::value > *
+	>
+	size_t internal::getStorageDimensions( const MatrixType &A ) noexcept {
+		static_assert( is_storage_based< MatrixType >::value, "getStorageDimensions supported only for storage-based containers.");
+		return static_cast< const typename MatrixType::base_type & >( A ).getStorageDimensions();
+	}
+
+} // namespace alp
+
+#endif // end ``_H_ALP_AMF_BASED_STORAGEBASEDMATRIX''
diff --git a/include/alp/amf-based/vector.hpp b/include/alp/amf-based/vector.hpp
new file mode 100644
index 000000000..0a6c2c972
--- /dev/null
+++ b/include/alp/amf-based/vector.hpp
@@ -0,0 +1,729 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 14th of January 2022
+ */
+
+#ifndef _H_ALP_AMF_BASED_VECTOR
+#define _H_ALP_AMF_BASED_VECTOR
+
+
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "matrix.hpp"
+#include "storage.hpp"
+
+
+namespace alp {
+
+ namespace internal {
+
+ template< typename T, typename Structure, typename View, typename ImfR, typename ImfC, enum Backend backend >
+ size_t getLength( const alp::Vector< T, Structure, Density::Dense, View, ImfR, ImfC, backend > &v ) noexcept {
+ return v._length();
+ }
+
+ template< typename T, typename Structure, typename View, typename ImfR, typename ImfC, enum Backend backend >
+ bool getInitialized( const alp::Vector< T, Structure, Density::Dense, View, ImfR, ImfC, backend > &v ) noexcept {
+ return getInitialized( static_cast< const typename alp::Vector< T, Structure, Density::Dense, View, ImfR, ImfC, backend >::base_type & >( v ) );
+ }
+
+ template<
+ typename T, typename Structure, typename View,
+ typename ImfR, typename ImfC, enum Backend backend
+ >
+ void setInitialized(
+ alp::Vector< T, Structure, Density::Dense, View, ImfR, ImfC, backend > &v, bool initialized
+ ) noexcept {
+ setInitialized( static_cast< typename alp::Vector< T, Structure, Density::Dense, View, ImfR, ImfC, backend >::base_type &>( v ), initialized );
+ }
+
+ template<
+ typename T, typename Structure, typename View,
+ typename ImfR, typename ImfC, enum Backend backend
+ >
+ typename alp::Vector<
+ T, Structure, Density::Dense, View, ImfR, ImfC, backend
+ >::iterator
+ begin(
+ alp::Vector< T, Structure, Density::Dense, View, ImfR, ImfC, backend > &v
+ ) noexcept;
+
+ template<
+ typename T, typename Structure, typename View,
+ typename ImfR, typename ImfC, enum Backend backend
+ >
+ typename alp::Vector<
+ T, Structure, Density::Dense, View, ImfR, ImfC, backend
+ >::iterator
+ end(
+ alp::Vector< T, Structure, Density::Dense, View, ImfR, ImfC, backend > &v
+ ) noexcept;
+
+ } // end namespace ``alp::internal''
+
+ /**
+ * \brief An ALP vector view.
+ *
+ * This is an opaque data type for vector views.
+ *
+ * A vector exposes a mathematical \em logical layout which allows to
+ * express implementation-oblivious concepts such as \em views on the vector.
+ * The logical layout of a vector view maps to a physical counterpart via
+ * a storage scheme which typically depends on the selected backend.
+ * alp::Vector may be used as an interface to such a physical layout.
+ *
+ * Views can be used to create logical \em perspectives on top of a container.
+ * For example, one may decide to refer to the part of the vector or
+ * to reference a diagonal of a matrix as a vector.
+ * See specialization \a Vector< T, Structure, Density::Dense, view::Diagonal< MatrixT >, backend >
+ * as an example of such usage.
+ *
+ * Vector View defined as views on other vectors do not instantiate a
+ * new container but refer to the one used by their targets.
+ *
+ * @tparam T type.
+ * @tparam Structure Structure introduced to match the template
+ * parameter list of \a Matrix
+ * @tparam View One of the vector views.
+ * All static views except for \a view::Original (via
+ * \a view::Original cannot instantiate a new container
+ * and only allow to refer to a previously defined
+ * \a Vector.
+ * The \a View parameter should not be used directly
+ * by the user but can be set using specific member types
+ * appropriately defined by each Vector and
+ * accessible via functions.
+ *
+ */
+ template<
+ typename T, typename Structure, typename View,
+ typename ImfR, typename ImfC, enum Backend backend
+ >
+ class Vector< T, Structure, Density::Dense, View, ImfR, ImfC, backend > { };
+
+ /*
+ * ALP vector with a general structure
+ */
+ template<
+ typename T, typename View,
+ typename ImfR, typename ImfC, enum Backend backend
+ >
+ class Vector<
+ T, structures::General, Density::Dense, View, ImfR, ImfC, backend
+ > : public Matrix< T, structures::General, Density::Dense, View, ImfR, ImfC, backend > {
+
+ public:
+
+ typedef Vector<
+ T, structures::General, Density::Dense,
+ View, ImfR, ImfC, backend
+ > self_type;
+
+ typedef Matrix<
+ T, structures::General, Density::Dense,
+ View, ImfR, ImfC, backend
+ > base_type;
+
+ private:
+ class VectorIterator:
+ public std::iterator< std::random_access_iterator_tag, T > {
+
+ friend class Vector<
+ T, structures::General, Density::Dense,
+ View, ImfR, ImfC, backend
+ >;
+
+ private:
+
+ typedef typename self_type::storage_index_type index_type;
+ typedef std::iterator<
+ std::random_access_iterator_tag, T
+ > std_base_class;
+
+ self_type *vec;
+ index_type position;
+
+ VectorIterator( self_type *vptr ) noexcept :
+ vec( vptr ), position( 0 )
+ {}
+
+ VectorIterator( self_type *vptr, index_type pos ) noexcept :
+ vec( vptr ), position( pos )
+ {}
+
+ bool equal( const VectorIterator &other ) const noexcept {
+ return ( vec == other.vec ) && ( position == other.position );
+ }
+
+ bool lessThen( const VectorIterator &other ) const noexcept {
+ return ( vec == other.vec ) && ( position < other.position );
+ }
+
+ public:
+ typedef typename std_base_class::pointer pointer;
+ typedef typename std_base_class::reference reference;
+ typedef typename std_base_class::difference_type difference_type;
+
+ /** Default constructor. */
+ VectorIterator() noexcept :
+ vec( nullptr ), position( 0 )
+ {}
+
+ /** Copy constructor. */
+ VectorIterator( const VectorIterator &other ) noexcept :
+ vec( other.vec ),
+ position( other.position )
+ {}
+
+ /** Move constructor. */
+ VectorIterator( VectorIterator &&other ) :
+ vec( nullptr ), position( 0 )
+ {
+ std::swap( vec, other.vec );
+ std::swap( position, other.position );
+ }
+
+ /** Copy assignment. */
+ VectorIterator& operator=( const VectorIterator &other ) noexcept {
+ vec = other.vec;
+ position = other.position;
+ return *this;
+ }
+
+ /** Move assignment. */
+ VectorIterator& operator=( VectorIterator &&other ) {
+ vec = nullptr;
+ position = 0;
+ std::swap( vec, other.vec );
+ std::swap( position, other.position );
+ return *this;
+ }
+
+ reference operator*() const {
+ return ( *vec )[ position ];
+ }
+
+ VectorIterator& operator++() {
+ ++position;
+ return *this;
+ }
+
+ VectorIterator& operator--() {
+ --position;
+ return *this;
+ }
+
+ VectorIterator operator++( int ) {
+ return VectorIterator( vec, position++ );
+ }
+
+ VectorIterator operator--( int ) {
+ return VectorIterator( vec, position-- );
+ }
+
+ VectorIterator operator+( const difference_type &n ) const {
+ return VectorIterator( vec, ( position + n ) );
+ }
+
+ VectorIterator& operator+=( const difference_type &n ) {
+ position += n;
+ return *this;
+ }
+
+ VectorIterator operator-( const difference_type &n ) const {
+ return VectorIterator( vec, ( position - n ) );
+ }
+
+ VectorIterator& operator-=( const difference_type &n ) {
+ position -= n;
+ return *this;
+ }
+
+ reference operator[]( const difference_type &n ) const {
+ return ( *vec )[ position + n ];
+ }
+
+ bool operator==( const VectorIterator &other ) const {
+ return equal( other );
+ }
+
+ bool operator!=( const VectorIterator &other ) const {
+ return !equal( other );
+ }
+
+ bool operator<( const VectorIterator &other ) const {
+ return lessThen( other );
+ }
+
+ bool operator>( const VectorIterator &other ) const {
+ return !( lessThen( other ) || equal( other ) );
+ }
+
+ bool operator<=( const VectorIterator &other ) const {
+ return lessThen( other ) || equal( other );
+ }
+
+ bool operator>=( const VectorIterator &other ) const {
+ return !lessThen( other );
+ }
+
+ difference_type operator+( const VectorIterator &other ) const {
+ assert( other.vec == vec );
+ return position + other.position;
+ }
+
+ difference_type operator-( const VectorIterator &other ) const {
+ assert( other.vec == vec );
+ return position - other.position;
+ }
+ };
+
+ /*********************
+ Storage info friends
+ ******************** */
+
+ friend size_t internal::getLength<>( const Vector< T, structures::General, Density::Dense, View, ImfR, ImfC, backend > &v ) noexcept;
+
+ /** Returns the length of the vector */
+ size_t _length() const {
+ return nrows( static_cast< const base_type & >( *this ) );
+ }
+
+ VectorIterator begin() noexcept {
+ return VectorIterator( this );
+ }
+
+ VectorIterator end() noexcept {
+ return VectorIterator( this, _length() );
+ }
+
+
+ public:
+
+ typedef VectorIterator iterator;
+
+ friend iterator internal::begin<>( self_type &v ) noexcept;
+ friend iterator internal::end<>( self_type &v ) noexcept;
+
+ /** @see Vector::value_type. */
+ using value_type = T;
+
+ typedef structures::General structure;
+
+ /** @see Vector::lambda_reference */
+ typedef typename std::conditional<
+ internal::is_storage_based< self_type >::value,
+ T &,
+ T
+ >::type lambda_reference;
+ typedef typename std::conditional<
+ internal::is_storage_based< self_type >::value,
+ const T &,
+ const T
+ >::type const_lambda_reference;
+
+ template < view::Views view_tag, bool d=false >
+ struct view_type;
+
+ template < bool d >
+ struct view_type< view::original, d > {
+ typedef Vector< T, structures::General, Density::Dense, view::Original< self_type >, imf::Id, imf::Id, backend > type;
+ };
+
+ template < bool d >
+ struct view_type< view::gather, d > {
+ typedef Vector< T, structures::General, Density::Dense, view::Gather< self_type >, imf::Strided, imf::Id, backend > type;
+ };
+
+ template < bool d >
+ struct view_type< view::matrix, d > {
+ typedef Matrix< T, structures::General, Density::Dense, view::Matrix< self_type >, imf::Id, imf::Id, backend > type;
+ };
+
+ /**
+ * Constructor for a storage-based vector that allocates storage.
+ */
+ Vector( const size_t length, const size_t cap = 0 ) :
+ base_type( length, 1, cap ) {
+ static_assert(
+ internal::is_view_over_storage< View >::value &&
+ internal::requires_allocation< View >::value,
+ "This constructor can only be used in storage-based allocation-requiring Vector specializations."
+ );
+ }
+
+ /**
+ * Constructor for a view over another storage-based vector.
+ *
+ * @tparam SourceType The type of the target vector.
+ *
+ */
+ template<
+ typename SourceType,
+ std::enable_if_t<
+ std::is_same< SourceType, typename View::applied_to >::value &&
+ internal::is_view_over_storage< View >::value &&
+ !internal::requires_allocation< View >::value
+ > * = nullptr
+ >
+ Vector( SourceType &source_vector, ImfR imf_r, ImfC imf_c ) :
+ base_type( source_vector, imf_r, imf_c ) { }
+
+ /**
+ * Constructor for a view over another vector applying a view defined
+ * by View template parameter of the constructed vector.
+ *
+ * @tparam SourceType The type of the target vector.
+ *
+ */
+ template<
+ typename SourceType,
+ std::enable_if_t<
+ std::is_same< SourceType, typename View::applied_to >::value &&
+ internal::is_view_over_storage< View >::value &&
+ !internal::requires_allocation< View >::value
+ > * = nullptr
+ >
+ Vector( SourceType &source_vector ) :
+ base_type( source_vector ) {}
+
+ /**
+ * @deprecated
+ * Constructor for a view over another storage-based vector.
+ *
+ * @tparam SourceType The type of the target vector.
+ *
+ */
+ template<
+ typename SourceType,
+ typename AmfType,
+ std::enable_if_t<
+ std::is_same< SourceType, typename View::applied_to >::value &&
+ internal::is_view_over_storage< View >::value &&
+ !internal::requires_allocation< View >::value
+ > * = nullptr
+ >
+ Vector( SourceType &source_vector, AmfType &&amf ) :
+ base_type( source_vector, std::forward< AmfType >( amf ) ) {}
+
+ /**
+ * Constructor for a functor-based vector that allocates memory.
+ *
+ * @tparam LambdaType The type of the lambda function associated to the data.
+ *
+ */
+ template<
+ typename LambdaType,
+ std::enable_if_t<
+ std::is_same< LambdaType, typename View::applied_to >::value &&
+ internal::is_view_over_functor< View >::value &&
+ internal::requires_allocation< View >::value
+ > * = nullptr
+ >
+ Vector( std::function< bool() > initialized, const size_t length, LambdaType lambda ) :
+ base_type( initialized, length, 1, lambda ) {}
+
+ /**
+ * Constructor for a view over another functor-based vector.
+ *
+ * @tparam SourceType The type of the target vector.
+ *
+ */
+ template<
+ typename SourceType,
+ std::enable_if_t<
+ std::is_same< SourceType, typename View::applied_to >::value &&
+ internal::is_view_over_functor< View >::value &&
+ !internal::requires_allocation< View >::value
+ > * = nullptr
+ >
+ Vector( SourceType &target_vector, ImfR imf_r, ImfC imf_c ) :
+ base_type( getFunctor( target_vector ), imf_r, imf_c ) {}
+
+ /**
+ * Constructor for a view over another functor-based vector.
+ *
+ * @tparam SourceType The type of the target vector.
+ *
+ */
+ template<
+ typename SourceType,
+ std::enable_if_t<
+ std::is_same< SourceType, typename View::applied_to >::value &&
+ internal::is_view_over_functor< View >::value &&
+ !internal::requires_allocation< View >::value
+ > * = nullptr
+ >
+ Vector( SourceType &target_vector ) :
+ base_type( getFunctor( target_vector ),
+ imf::Id( nrows ( target_vector ) ),
+ imf::Id( 1 )
+ ) {
+
+ static_assert(
+ std::is_same< ImfR, imf::Id >::value &&
+ std::is_same< ImfC, imf::Id >::value,
+ "This constructor can only be used with Id IMFs."
+ );
+
+ }
+
+ /** \internal No implementation notes. */
+ lambda_reference operator[]( const size_t i ) noexcept {
+ assert( i < _length() );
+ //assert( getInitialized( *v ) );
+ /** \internal \todo revise the third and fourth parameter for parallel backends */
+ return this->access( this->getStorageIndex( i, 0, 0, 1 ) );
+ }
+
+ /** \internal No implementation notes. */
+ const_lambda_reference operator[]( const size_t i ) const noexcept {
+ assert( i < _length() );
+ //assert( getInitialized( *v ) );
+ /** \internal \todo revise the third and fourth parameter for parallel backends */
+ return this->access( this->getStorageIndex( i, 0, 0, 1 ) );
+ }
+
+ }; // class Vector with physical container
+
+ namespace internal {
+
+ template<
+ typename T, typename Structure, typename View,
+ typename ImfR, typename ImfC, enum Backend backend
+ >
+ typename alp::Vector<
+ T, Structure, Density::Dense, View, ImfR, ImfC, backend
+ >::iterator
+ begin(
+ alp::Vector< T, Structure, Density::Dense, View, ImfR, ImfC, backend > &v
+ ) noexcept {
+ return v.begin();
+ }
+
+ template<
+ typename T, typename Structure, typename View,
+ typename ImfR, typename ImfC, enum Backend backend
+ >
+ typename alp::Vector<
+ T, Structure, Density::Dense, View, ImfR, ImfC, backend
+ >::iterator
+ end(
+ alp::Vector< T, Structure, Density::Dense, View, ImfR, ImfC, backend > &v
+ ) noexcept {
+ return v.end();
+ }
+
+ } // end namespace ``alp::internal''
+
+ /** Identifies any backend's implementation of ALP vector as an ALP vector. */
+ template< typename T, typename Structure, typename View, typename ImfR, typename ImfC, enum Backend backend >
+ struct is_vector< Vector< T, Structure, Density::Dense, View, ImfR, ImfC, backend > > : std::true_type {};
+
+ /**
+ * @brief Generate an original view of the input Vector. The function guarantees
+ * the created view is non-overlapping with other existing views only when the
+ * check can be performed in constant time.
+ *
+ * @tparam SourceVector The type of the source ALP vector
+ *
+ * @param[in] source The ALP Vector object over which the view is created.
+ *
+ * @returns A new ALP Vector object.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This function performs
+ * \f$ \Theta(nref) \f$ amount of work where \f$ nref \f$ is the number
+ * of available views of \a source.
+ * -# A call to this function may use \f$ \mathcal{O}(1) \f$ bytes
+ * of memory beyond the memory in use at the function call entry.
+ * -# This function may make system calls.
+ * \endparblock
+ *
+ */
+ template<
+ typename SourceVector,
+ std::enable_if_t< is_vector< SourceVector >::value > * = nullptr
+ >
+ typename SourceVector::template view_type< view::original >::type
+ get_view( SourceVector &source ) {
+
+ using target_t = typename SourceVector::template view_type< view::original >::type;
+
+ return target_t( source );
+ }
+
+ /**
+ * Create a matrix view over a vector.
+ * The resulting matrix is a column matrix of size M x 1, where M is vector length.
+ * The function guarantees the created view is non-overlapping with other
+ * existing views only when the check can be performed in constant time.
+ *
+ * @tparam target_view The type of the view to apply to the vector.
+ * Only supports value view::matrix.
+ * @tparam SourceVector The type of the source ALP vector
+ *
+ * @param[in] source The ALP Vector object over which the view is created.
+ *
+ */
+ template<
+ enum view::Views target_view,
+ typename SourceVector,
+ std::enable_if_t<
+ is_vector< SourceVector >::value &&
+ target_view == view::matrix
+ > * = nullptr
+ >
+ typename SourceVector::template view_type< target_view >::type
+ get_view( SourceVector &source ) {
+ using target_t = typename SourceVector::template view_type< target_view >::type;
+ return target_t( source );
+ }
+
+ namespace internal {
+
+ /**
+ * Implement a gather through a View over compatible Structure using provided Index Mapping Functions.
+ * The compatibility depends on the TargetStructure, SourceStructure and IMFs, and is calculated during runtime.
+ */
+ template<
+ typename TargetStructure, typename TargetImfR,
+ typename SourceVector,
+ std::enable_if_t< is_vector< SourceVector >::value > * = nullptr
+ >
+ typename internal::new_container_type_from<
+ typename SourceVector::template view_type< view::gather >::type
+ >::template change_structure< TargetStructure >::_and_::
+ template change_imfr< TargetImfR >::type
+ get_view(
+ SourceVector &source,
+ TargetImfR imf_r,
+ imf::Id imf_c
+ ) {
+
+ //if( std::dynamic_pointer_cast< imf::Select >( imf_r ) || std::dynamic_pointer_cast< imf::Select >( imf_c ) ) {
+ // throw std::runtime_error("Cannot gather with imf::Select yet.");
+ //}
+ // No static check as the compatibility depends on IMF, which is a runtime level parameter
+ //if( ! (TargetStructure::template isInstantiableFrom< Structure >( static_cast< TargetImfR & >( imf_r ), static_cast< TargetImfR & >( imf_c ) ) ) ) {
+ if( ! (structures::isInstantiable< typename SourceVector::structure, TargetStructure >::check( imf_r, imf_c ) ) ) {
+ throw std::runtime_error("Cannot gather into specified TargetStructure from provided SourceStructure and Index Mapping Functions.");
+ }
+
+ using target_vec_t = typename internal::new_container_type_from<
+ typename SourceVector::template view_type< view::gather >::type
+ >::template change_structure< TargetStructure >::_and_::
+ template change_imfr< TargetImfR >::type;
+
+ return target_vec_t( source, imf_r, imf_c );
+ }
+ } // namespace internal
+
+ /**
+ * @brief Version of get_view over vectors where a range of elements are selected to form a new view.
+ * The function guarantees the created view is non-overlapping with other existing views only when the
+ * check can be performed in constant time.
+ *
+ * @tparam SourceVector The type of the source ALP vector
+ *
+ * @param[in] source The ALP Vector object over which the view is created.
+ * @param[in] rng A valid range of elements
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This function performs
+ * \f$ \Theta(nref) \f$ amount of work where \f$ nref \f$ is the number
+ * of available views of \a source.
+ * -# A call to this function may use \f$ \mathcal{O}(1) \f$ bytes
+ * of memory beyond the memory in use at the function call entry.
+ * -# This function may make system calls.
+ * \endparblock
+ *
+ */
+ template<
+ typename SourceVector,
+ std::enable_if_t< is_vector< SourceVector >::value > * = nullptr
+ >
+ typename SourceVector::template view_type< view::gather >::type
+ get_view( SourceVector &source, const utils::range& rng ) {
+
+ return internal::get_view< typename SourceVector::structure >(
+ source,
+ std::move( imf::Strided( rng.count(), nrows(source), rng.start, rng.stride ) ),
+ std::move( imf::Id( 1 ) )
+ );
+ }
+
+ /**
+ *
+ * Generate a dynamic gather view where the type is compliant with the source Vector.
+ * Version where a selection of indices, expressed as a vector of indices,
+ * form a new view with specified target structure.
+ *
+ * @tparam TargetStructure The target structure of the new view. It should verify
+ * alp::is_in .
+ * @tparam SourceVector The type of the source ALP vector
+ * @tparam SelectVector The type of the ALP vector defining permutation for rows
+ *
+ * @param source The source ALP matrix
+ * @param sel A valid permutation vector of a subset of indices
+ *
+ * @return A new gather view over the source ALP matrix.
+ *
+ */
+ template<
+ typename TargetStructure,
+ typename SourceVector,
+ typename SelectVector,
+ std::enable_if_t<
+ is_vector< SourceVector >::value &&
+ is_vector< SelectVector >::value
+ > * = nullptr
+ >
+ typename internal::new_container_type_from<
+ typename SourceVector::template view_type< view::gather >::type
+ >::template change_structure< TargetStructure >::_and_::
+ template change_imfr< imf::Select >::type
+ get_view(
+ SourceVector &source,
+ const SelectVector &sel
+ ) {
+ return internal::get_view< TargetStructure >(
+ source,
+ imf::Select( size( source ), sel ),
+ imf::Id( 1 )
+ );
+ }
+
+} // end namespace ``alp''
+
+#endif // end ``_H_ALP_AMF_BASED_VECTOR''
+
diff --git a/include/alp/backends.hpp b/include/alp/backends.hpp
new file mode 100644
index 000000000..1e4d6ec42
--- /dev/null
+++ b/include/alp/backends.hpp
@@ -0,0 +1,60 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author: A. N. Yzelman
+ * @date 21st of December, 2016
+ *
+ * @file This file contains a register of all backends that are either
+ * implemented, under implementation, or were at any point in time
+ * conceived and noteworthy enough to be recorded for future
+ * consideration to implement. It does so via the alp::Backend
+ * enum.
+ */
+
+#ifndef _H_ALP_BACKENDS
+#define _H_ALP_BACKENDS
+
+namespace alp {
+
+ /**
+ * This enum collects all implemented backends. Depending on compile flags,
+ * some of these options may be disabled.
+ */
+ enum Backend {
+
+ /**
+ * The ALP reference backend.
+ */
+ reference,
+
+ /*
+ * The ALP dispatch backend.
+ */
+ dispatch,
+
+ /**
+ * The ALP OpenMP backend.
+ */
+ omp,
+
+ };
+
+} // namespace alp
+
+#endif
+
diff --git a/include/alp/base/blas0.hpp b/include/alp/base/blas0.hpp
new file mode 100644
index 000000000..460eaa347
--- /dev/null
+++ b/include/alp/base/blas0.hpp
@@ -0,0 +1,457 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 5th of December 2016
+ */
+
+#ifndef _H_ALP_BLAS0_BASE
+#define _H_ALP_BLAS0_BASE
+
+#include <type_traits> //enable_if
+
+#include
+#include
+
+#include "config.hpp"
+#include "scalar.hpp"
+
+namespace alp {
+
+ /**
+ * \defgroup BLAS0 The Level-0 Basic Linear Algebra Subroutines (BLAS)
+ *
+ * A collection of functions that let GraphBLAS operators work on
+ * zero-dimensional containers, i.e., on scalars.
+ *
+ * The GraphBLAS uses opaque data types and defines several standard functions
+ * to operate on these data types. Examples types are alp::Vector and
+ * alp::Matrix, example functions are alp::dot and alp::vxm.
+ *
+ * To input data into an opaque GraphBLAS type, each opaque type defines a
+ * member function \a build: alp::Vector::build() and alp::Matrix::build().
+ *
+ * To extract data from opaque GraphBLAS types, each opaque type provides
+ * \em iterators that may be obtained via the STL standard \a begin and \a end
+ * functions:
+ * - alp::Vector::begin or alp::Vector::cbegin
+ * - alp::Vector::end or alp::Vector::cend
+ * - alp::Matrix::begin or alp::Matrix::cbegin
+ * - alp::Matrix::end or alp::Matrix::cend
+ *
+ * Some GraphBLAS functions, however, reduce all elements in a GraphBLAS
+ * container into a single element of a given type. So for instance, alp::dot
+ * on two vectors of type alp::Vector using the regular real semiring
+ * alp::Semiring will store its output in a variable of type \a double.
+ *
+ * When parametrising GraphBLAS functions in terms of arbitrary Semirings,
+ * Monoids, Operators, and object types, it is useful to have a way to apply
+ * the same operators on whatever type they make functions like alp::dot
+ * produce-- that is, we require functions that enable the application of
+ * GraphBLAS operators on single elements.
+ *
+ * This group of BLAS level 0 functions provides this functionality.
+ *
+ * @{
+ */
+
+ /**
+ * Out-of-place application of the operator \a OP on two data elements.
+ *
+ * The output data will be output to an existing memory location, overwriting
+ * any existing data.
+ *
+ * @tparam descr The descriptor passed to this operator.
+	 * @tparam OP          The type of the operator to apply.
+ * @tparam InputType1 The left-hand side input argument type.
+ * @tparam InputType2 The right-hand side input argument type.
+ * @tparam OutputType The output argument type.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * -# alp::descriptors::no_operation for default behaviour.
+ * -# alp::descriptors::no_casting when a call to this function should *not*
+ * automatically cast input arguments to operator input domain, and *not*
+ * automatically cast operator output to the output argument domain.
+ * \endparblock
+ *
+ * If \a InputType1 does not match the left-hand side input domain of \a OP,
+ * or if \a InputType2 does not match the right-hand side input domain of
+ * \a OP, or if \a OutputType does not match the output domain of \a OP while
+ * alp::descriptors::no_casting was set, then the code shall not compile.
+ *
+ * @param[in] x The left-hand side input data.
+ * @param[in] y The right-hand side input data.
+ * @param[out] out Where to store the result of the operator.
+ * @param[in] op The operator to apply (optional).
+ *
+ * \note \a op is optional when the operator type \a OP is explicitly given.
+ * Thus there are two ways of calling this function:
+ * -#
+ * Scalar< double > a, b, c;
+ * alp::apply< alp::operators::add >( a, b, c );
+ * , or
+ * -#
+ * Scalar< double > a, b, c;
+ * alp::operators::add< double > addition_over_doubles;
+ * alp::apply( a, b, c, addition_over_doubles);
+ *
+ *
+ * \note There should be no performance difference between the two ways of
+ * calling this function. For compatibility with other ALP
+ * implementations, the latter type of call is preferred.
+ *
+ * @return alp::SUCCESS A call to this function never fails.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This call comprises \f$ \Theta(1) \f$ work. The constant factor
+ * depends on the cost of evaluating the operator.
+ * -# This call takes \f$ \mathcal{O}(1) \f$ memory beyond the memory
+ * already used by the application when a call to this function is
+ * made.
+ * -# This call incurs at most \f$ \Theta(1) \f$ memory where the
+ * constant factor depends on the storage requirements of the
+ * arguments and the temporary storage required for evaluation of
+ * this operator.
+ * \endparblock
+ *
+	 * \warning The use of stateful operators, or even the use of stateless
+ * operators that are not included in alp::operators, may cause this
+ * function to incur performance penalties beyond the worst case
+ * sketched above.
+ *
+ * @see foldr for applying an operator in-place (if allowed).
+ * @see foldl for applying an operator in-place (if allowed).
+ * @see alp::operators::internal::Operator for a discussion on when foldr and
+ * foldl successfully generate in-place code.
+ */
+ template<
+ class OP,
+ typename InputType1, typename InputStructure1,
+ typename InputType2, typename InputStructure2,
+ typename OutputType, typename OutputStructure,
+ enum Backend implementation = config::default_backend
+ >
+ RC apply(
+ Scalar< OutputType, OutputStructure, implementation > &out,
+ const Scalar< InputType1, InputStructure1, implementation > &x,
+ const Scalar< InputType2, InputStructure2, implementation > &y,
+ const OP &op = OP(),
+ const std::enable_if_t<
+ alp::is_operator< OP >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ !alp::is_object< OutputType >::value
+ > * = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "Selected backend does not implement alp::apply (scalar)\n";
+#endif
+#ifndef NDEBUG
+ const bool backend_does_not_support_scalar_apply = false;
+ assert( backend_does_not_support_scalar_apply );
+#endif
+
+ (void) out;
+ (void) x;
+ (void) y;
+ (void) op;
+
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Application of the operator \a OP on two data elements. The output data
+ * will overwrite the right-hand side input element.
+ *
+ * In mathematical notation, this function calculates \f$ x \odot y \f$ and
+ * copies the result into \a y.
+ *
+ * @tparam OP The type of the operator to apply.
+ * @tparam InputType The type of the left-hand side input element. This
+ * element will be accessed read-only.
+ * @tparam IOType The type of the right-hand side input element, which will
+ * be overwritten.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * -# alp::descriptors::no_operation for default behaviour.
+ * -# alp::descriptors::no_casting when a call to this function should *not*
+ * automatically cast input arguments to operator input domain, and *not*
+ * automatically cast operator output to the output argument domain.
+ * \endparblock
+ *
+ * If \a InputType does not match the left-hand side input domain
+ * (see alp::operators::internal::Operator::D1) corresponding to \a OP, then
+ * \a x will be temporarily cached and cast into \a D1.
+ * If \a IOType does not match the right-hand side input domain corresponding
+ * to \a OP, then \a y will be temporarily cached and cast into \a D2.
+ * If \a IOType does not match the output domain corresponding to \a OP, then
+ * the result of \f$ x \odot y \f$ will be temporarily cached before cast to
+ * \a IOType and written to \a y.
+ *
+ * @param[in] x The left-hand side input parameter.
+ * @param[in,out] y On function entry: the right-hand side input parameter.
+ * On function exit: the output of the operator.
+ * @param[in] op The operator to apply (optional).
+ *
+ * @return alp::SUCCESS A call to this function never fails.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This call comprises \f$ \Theta(1) \f$ work. The constant factor
+ * depends on the cost of evaluating the operator.
+ * -# This call will not allocate any new dynamic memory.
+ * -# This call requires at most \f$ \mathit{sizeof}(D_1+D_2+D_3) \f$
+ * bytes of temporary storage, plus any temporary requirements for
+ * evaluating \a op.
+ * -# This call incurs at most \f$ \mathit{sizeof}(D_1+D_2+D_3) +
+ * \mathit{sizeof}(\mathit{InputType}+2\mathit{IOType}) \f$ bytes of
+ * data movement, plus any data movement requirements for evaluating
+ * \a op.
+ * \endparblock
+ *
+	 * \warning The use of stateful operators, or even the use of stateless
+ * operators that are not included in alp::operators, may cause this
+ * function to incur performance penalties beyond the worst case
+ * sketched above.
+ *
+ * \note For the standard stateless operators in alp::operators, there are
+ * no additional temporary storage requirements nor any additional data
+ * movement requirements than the ones mentioned above.
+ *
+ * \note If \a OP is fold-right capable, the temporary storage and data
+ * movement requirements are less than reported above.
+ *
+ * @see foldl for a left-hand in-place version.
+ * @see apply for an example of how to call this function without explicitly
+ * passing \a op.
+	 * @see alp::operators::internal::Operator for a discussion on fold-right
+ * capable operators and on stateful operators.
+ */
+ template<
+ class OP,
+ typename InputType, typename InputStructure,
+ typename IOType, typename IOStructure,
+ enum Backend implementation = config::default_backend
+ >
+ RC foldr(
+ const Scalar< InputType, InputStructure, implementation > &x,
+ Scalar< IOType, IOStructure, implementation > &y,
+ const OP & op = OP(),
+ const std::enable_if_t<
+ alp::is_operator< OP >::value &&
+ ! alp::is_object< InputType >::value &&
+ ! alp::is_object< IOType >::value
+ > * = nullptr
+ ) {
+
+#ifdef _DEBUG
+ std::cerr << "Selected backend does not implement alp::foldr (scalar)\n";
+#endif
+#ifndef NDEBUG
+ const bool backend_does_not_support_scalar_foldr = false;
+ assert( backend_does_not_support_scalar_foldr );
+#endif
+
+ (void) x;
+ (void) y;
+ (void) op;
+
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Application of the operator \a OP on two data elements. The output data
+ * will overwrite the left-hand side input element.
+ *
+ * In mathematical notation, this function calculates \f$ x \odot y \f$ and
+ * copies the result into \a x.
+ *
+ * @tparam OP The type of the operator to apply.
+ * @tparam IOType The type of the left-hand side input element, which will
+ * be overwritten.
+ * @tparam InputType The type of the right-hand side input element. This
+ * element will be accessed read-only.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * -# alp::descriptors::no_operation for default behaviour.
+ * -# alp::descriptors::no_casting when a call to this function should *not*
+ * automatically cast input arguments to operator input domain, and *not*
+ * automatically cast operator output to the output argument domain.
+ * \endparblock
+ *
+ * If \a InputType does not match the right-hand side input domain
+ * (see alp::operators::internal::Operator::D2) corresponding to \a OP, then
+ * \a x will be temporarily cached and cast into \a D2.
+ * If \a IOType does not match the left-hand side input domain corresponding
+ * to \a OP, then \a y will be temporarily cached and cast into \a D1.
+ * If \a IOType does not match the output domain corresponding to \a OP, then
+ * the result of \f$ x \odot y \f$ will be temporarily cached before cast to
+ * \a IOType and written to \a y.
+ *
+ * @param[in,out] x On function entry: the left-hand side input parameter.
+ * On function exit: the output of the operator.
+ * @param[in] y The right-hand side input parameter.
+ * @param[in] op The operator to apply (optional).
+ *
+ * @return alp::SUCCESS A call to this function never fails.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This call comprises \f$ \Theta(1) \f$ work. The constant factor
+ * depends on the cost of evaluating the operator.
+ * -# This call will not allocate any new dynamic memory.
+ * -# This call requires at most \f$ \mathit{sizeof}(D_1+D_2+D_3) \f$
+ * bytes of temporary storage, plus any temporary requirements for
+ * evaluating \a op.
+ * -# This call incurs at most \f$ \mathit{sizeof}(D_1+D_2+D_3) +
+ * \mathit{sizeof}(\mathit{InputType}+2\mathit{IOType}) \f$ bytes of
+ * data movement, plus any data movement requirements for evaluating
+ * \a op.
+ * \endparblock
+ *
+	 * \warning The use of stateful operators, or even the use of stateless
+ * operators that are not included in alp::operators, may cause this
+ * function to incur performance penalties beyond the worst case
+ * sketched above.
+ *
+ * \note For the standard stateless operators in alp::operators, there are
+ * no additional temporary storage requirements nor any additional data
+ * movement requirements than the ones mentioned above.
+ *
+ * \note If \a OP is fold-left capable, the temporary storage and data
+ * movement requirements are less than reported above.
+ *
+ * @see foldr for a right-hand in-place version.
+ * @see apply for an example of how to call this function without explicitly
+ * passing \a op.
+	 * @see alp::operators::internal::Operator for a discussion on fold-right
+ * capable operators and on stateful operators.
+ */
+ template<
+ class OP,
+ typename InputType, typename InputStructure,
+ typename IOType, typename IOStructure,
+ enum Backend implementation = config::default_backend
+ >
+ RC foldl(
+ Scalar< IOType, IOStructure, implementation > &x,
+ const Scalar< InputType, InputStructure, implementation > &y,
+ const OP & op = OP(),
+ const std::enable_if_t<
+ alp::is_operator< OP >::value &&
+ ! alp::is_object< InputType >::value &&
+ ! alp::is_object< IOType >::value
+ > * = nullptr
+ ) {
+
+#ifdef _DEBUG
+ std::cerr << "Selected backend does not implement alp::foldl (scalar)\n";
+#endif
+#ifndef NDEBUG
+ const bool backend_does_not_support_scalar_foldl = false;
+ assert( backend_does_not_support_scalar_foldl );
+#endif
+
+ (void) x;
+ (void) y;
+ (void) op;
+
+ return UNSUPPORTED;
+ }
+
+ /** @} */
+
+ namespace internal {
+
+ /**
+ * Helper class that, depending on a given descriptor, either returns a
+ * nonzero value from a vector, or its corresponding coordinate.
+ *
+ * This class hence makes the use of the following descriptor(s) transparent:
+ * -# #alp::descriptors::use_index
+ *
+ * @tparam descr The descriptor under which to write back either the value or
+ * the index.
+ * @tparam OutputType The type of the output to return.
+ * @tparam D The type of the input.
+ * @tparam Enabled Controls, through SFINAE, whether the use of the
+ * #use_index descriptor is allowed at all.
+ */
+ template< alp::Descriptor descr, typename OutputType, typename D, typename Enabled = void >
+ class ValueOrIndex;
+
+ /* Version where use_index is allowed. */
+ template< alp::Descriptor descr, typename OutputType, typename D >
+ class ValueOrIndex<
+ descr, OutputType, D,
+ typename std::enable_if< std::is_arithmetic< OutputType >::value
+ && ! std::is_same< D, void >::value >::type
+ > {
+ private:
+ static constexpr const bool use_index = descr & alp::descriptors::use_index;
+ static_assert(
+ use_index
+ || std::is_convertible< D, OutputType >::value, "Cannot convert to the requested output type"
+ );
+
+ public:
+
+ static OutputType getFromScalar( const D &x, const size_t index ) noexcept {
+ if( use_index ) {
+ return static_cast< OutputType >( index );
+ } else {
+ return static_cast< OutputType >( x );
+ }
+ }
+
+ };
+
+ /* Version where use_index is not allowed. */
+ template< alp::Descriptor descr, typename OutputType, typename D >
+ class ValueOrIndex<
+ descr, OutputType, D,
+ typename std::enable_if< ! std::is_arithmetic< OutputType >::value
+ && ! std::is_same< OutputType, void >::value >::type
+ > {
+ static_assert(
+ !( descr & descriptors::use_index ),
+ "use_index descriptor given while output type is not numeric"
+ );
+ static_assert(
+ std::is_convertible< D, OutputType >::value,
+ "Cannot convert input to the given output type"
+ );
+
+ public:
+
+ static OutputType getFromScalar( const D &x, const size_t ) noexcept {
+ return static_cast< OutputType >( x );
+ }
+ };
+
+ } // namespace internal
+
+} // namespace alp
+
+#undef NO_CAST_ASSERT
+
+#endif // end ``_H_ALP_BLAS0_BASE''
diff --git a/include/alp/base/blas1.hpp b/include/alp/base/blas1.hpp
new file mode 100644
index 000000000..9ed0711d3
--- /dev/null
+++ b/include/alp/base/blas1.hpp
@@ -0,0 +1,877 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 5th of December 2016
+ */
+
+#ifndef _H_ALP_BASE_BLAS1
+#define _H_ALP_BASE_BLAS1
+
+#include // use from grb
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+namespace alp {
+
+ /**
+ * \defgroup BLAS1 The Level-1 ALP/GraphBLAS routines
+ * @{
+ */
+
+ /**
+  * Folds all elements of an ALP vector \a x into a single Scalar \a beta,
+  * using the given \a monoid as the reduction operation.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  * NOTE(review): presumably shadowed by backend-specific overloads.
+  */
+ template< Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ typename IOType, typename IOStructure,
+ class Monoid,
+ Backend backend
+ >
+ RC foldr(
+ const Vector< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &x,
+ Scalar< IOType, IOStructure, backend > &beta,
+ const Monoid &monoid = Monoid(),
+ const std::enable_if_t<
+ !alp::is_object< InputType >::value && !alp::is_object< IOType >::value && alp::is_monoid< Monoid >::value
+ > * const = nullptr
+ ) {
+ (void) x;
+ (void) beta;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * C++ scalar variant of the reduce-to-scalar foldr: the result lands in a
+  * raw \a IOType instead of an ALP Scalar.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template< Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ typename IOType,
+ class Monoid,
+ Backend backend
+ >
+ RC foldr(
+ const Vector< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &x,
+ IOType &beta,
+ const Monoid &monoid = Monoid(),
+ const std::enable_if_t<
+ !alp::is_object< InputType >::value && !alp::is_object< IOType >::value && alp::is_monoid< Monoid >::value
+ > * const = nullptr
+ ) {
+ (void) x;
+ (void) beta;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * For all elements in an ALP vector \a y, folds the Scalar value
+  * \f$ \alpha \f$ into each element, using the given \a monoid.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template< Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ class Monoid,
+ Backend backend
+ >
+ RC foldr(
+ const Scalar< InputType, InputStructure, backend > &alpha,
+ Vector< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &y,
+ const Monoid & monoid = Monoid(),
+ const std::enable_if_t<
+ !alp::is_object< InputType >::value && !alp::is_object< IOType >::value && alp::is_monoid< Monoid >::value
+ > * const = nullptr
+ ) {
+ (void) alpha;
+ (void) y;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Computes y = x + y, operator variant.
+  *
+  * Specialisation for scalar \a x (given as an ALP Scalar \a alpha).
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template< Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ class OP, Backend backend
+ >
+ RC foldr(
+ const Scalar< InputType, InputStructure, backend > &alpha,
+ Vector< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &y,
+ const OP & op = OP(),
+ const std::enable_if_t<
+ !alp::is_object< InputType >::value && ! alp::is_object< IOType >::value && alp::is_operator< OP >::value
+ > * const = nullptr
+ ) {
+ (void) alpha;
+ (void) y;
+ (void) op;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Folds all elements of an ALP vector \a x into the corresponding elements
+  * of the input/output vector \a y. Operator version.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template< Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ class OP, Backend backend
+ >
+ RC foldr(
+ const Vector< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &x,
+ Vector< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &y,
+ const OP & op = OP(),
+ const std::enable_if_t<
+ alp::is_operator< OP >::value && ! alp::is_object< InputType >::value && ! alp::is_object< IOType >::value
+ > * = nullptr
+ ) {
+ (void) x;
+ (void) y;
+ (void) op;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Folds all elements of an ALP vector \a x into the corresponding elements
+  * of the input/output vector \a y. Monoid version.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template< Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ class Monoid,
+ Backend backend
+ >
+ RC foldr(
+ const Vector< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &x,
+ Vector< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &y,
+ const Monoid & monoid = Monoid(),
+ const std::enable_if_t<
+ alp::is_monoid< Monoid >::value && ! alp::is_object< InputType >::value && ! alp::is_object< IOType >::value
+ > * = nullptr
+ ) {
+ (void) x;
+ (void) y;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * For all elements in an ALP vector \a x, folds the Scalar value
+  * \f$ \beta \f$ into each element, using the given operator \a op.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ typename InputType, typename InputStructure,
+ class Op,
+ Backend backend
+ >
+ RC foldl(
+ Vector< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &x,
+ // Pass by const reference, consistent with all sibling overloads taking
+ // Scalar arguments (was pass-by-value; const& binds to the same callers).
+ const Scalar< InputType, InputStructure, backend > &beta,
+ const Op &op = Op(),
+ const std::enable_if_t<
+ ! alp::is_object< IOType >::value && ! alp::is_object< InputType >::value && alp::is_operator< Op >::value
+ > * = nullptr
+ ) {
+ (void) x;
+ (void) beta;
+ (void) op;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Folds all elements of an ALP vector \a y into the corresponding elements
+  * of the input/output vector \a x. Operator version.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ class OP,
+ Backend backend
+ >
+ RC foldl(
+ Vector< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &x,
+ const Vector< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &y,
+ const OP &op = OP(),
+ const std::enable_if_t<
+ alp::is_operator< OP >::value && !alp::is_object< IOType >::value && !alp::is_object< InputType >::value
+ > * = nullptr
+ ) {
+ (void) x;
+ (void) y;
+ (void) op;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Folds all elements of an ALP vector \a y into the corresponding elements
+  * of the input/output vector \a x. Monoid version.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ class Monoid,
+ Backend backend
+ >
+ RC foldl(
+ Vector< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &x,
+ const Vector< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &y,
+ const Monoid &monoid = Monoid(),
+ const std::enable_if_t<
+ alp::is_monoid< Monoid >::value && ! alp::is_object< IOType >::value && ! alp::is_object< InputType >::value
+ > * = nullptr
+ ) {
+ (void) x;
+ (void) y;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Calculates the element-wise operation between one vector and one scalar,
+  * \f$ z = x .* \beta \f$, using the given operator. Out of place.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR, typename InputImfC,
+ typename InputType2, typename InputStructure2,
+ class OP,
+ Backend backend
+ >
+ RC eWiseApply(
+ Vector< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &z,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR, InputImfC, backend > &x,
+ const Scalar< InputType2, InputStructure2, backend > &beta,
+ const OP &op = OP(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_operator< OP >::value
+ > * const = nullptr
+ ) {
+ (void) z;
+ (void) x;
+ (void) beta;
+ (void) op;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Computes \f$ z = x \odot y \f$, out of place.
+  *
+  * Specialisation for both \a x and \a y scalar, operator version.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1,
+ typename InputType2, typename InputStructure2,
+ class OP,
+ Backend backend
+ >
+ RC eWiseApply(
+ Vector< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &z,
+ const Scalar< InputType1, InputStructure1, backend> &alpha,
+ const Scalar< InputType2, InputStructure2, backend> &beta,
+ const OP &op = OP(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_operator< OP >::value
+ > * const = nullptr
+ ) {
+ (void) z;
+ (void) alpha;
+ (void) beta;
+ (void) op;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Computes \f$ z = x \odot y \f$, out of place.
+  *
+  * Specialisation for both \a x and \a y scalar, monoid version.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1,
+ typename InputType2, typename InputStructure2,
+ class Monoid,
+ Backend backend
+ >
+ RC eWiseApply(
+ Vector< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &z,
+ const Scalar< InputType1, InputStructure1, backend> &alpha,
+ const Scalar< InputType2, InputStructure2, backend> &beta,
+ const Monoid &monoid = Monoid(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_monoid< Monoid >::value
+ > * const = nullptr
+ ) {
+ (void) z;
+ (void) alpha;
+ (void) beta;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Computes \f$ z = x \odot y \f$, out of place.
+  *
+  * Vector-vector monoid version.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class Monoid,
+ Backend backend
+ >
+ RC eWiseApply(
+ Vector< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &z,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &x,
+ const Vector< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &y,
+ const Monoid &monoid = Monoid(),
+ const std::enable_if_t<
+ ! alp::is_object< OutputType >::value &&
+ ! alp::is_object< InputType1 >::value &&
+ ! alp::is_object< InputType2 >::value &&
+ alp::is_monoid< Monoid >::value
+ > * const = nullptr
+ ) {
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Computes \f$ z = x \odot y \f$, out of place.
+  *
+  * Specialisation for scalar \a x. Monoid version.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class Monoid,
+ Backend backend
+ >
+ RC eWiseApply(
+ Vector< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &z,
+ const Scalar< InputType1, InputStructure1, backend> &alpha,
+ const Vector< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &y,
+ const Monoid &monoid = Monoid(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_monoid< Monoid >::value
+ > * const = nullptr
+ ) {
+ (void) z;
+ (void) alpha;
+ (void) y;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Computes \f$ z = x \odot y \f$, out of place.
+  *
+  * Specialisation for scalar \a y. Monoid version.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2,
+ class Monoid,
+ Backend backend
+ >
+ RC eWiseApply(
+ Vector< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &z,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &x,
+ const Scalar< InputType2, InputStructure2, backend > &beta,
+ const Monoid &monoid = Monoid(),
+ // BUGFIX: was `typename std::enable_if< ... > *` (missing ::type), which
+ // is a pointer to the enable_if struct itself and therefore always valid:
+ // the SFINAE guard never disabled this overload. Use enable_if_t like all
+ // sibling overloads so the condition actually participates in SFINAE.
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_monoid< Monoid >::value
+ > * const = nullptr
+ ) {
+ (void) z;
+ (void) x;
+ (void) beta;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Calculates the element-wise operation between one scalar and one vector,
+  * \f$ z = \alpha .* y \f$, using the given operator. Out of place.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class OP,
+ Backend backend
+ >
+ RC eWiseApply(
+ Vector< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &z,
+ const Scalar< InputType1, InputStructure1, backend > &alpha,
+ const Vector< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &y,
+ const OP &op = OP(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_operator< OP >::value
+ > * const = nullptr
+ ) {
+ (void) z;
+ (void) alpha;
+ (void) y;
+ (void) op;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Calculates the element-wise operation on elements of two vectors,
+  * \f$ z = x .* y \f$, using the given operator. Out of place.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class OP,
+ Backend backend
+ >
+ RC eWiseApply(
+ Vector< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &z,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &x,
+ const Vector< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &y,
+ const OP &op = OP(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_operator< OP >::value
+ > * const = nullptr
+ ) {
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) op;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Calculates the element-wise multiplication of two vectors,
+  * \f$ z = z + x .* y \f$, under a given semiring. In place on \a z.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class Ring,
+ Backend backend
+ >
+ RC eWiseMul(
+ Vector< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &z,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &x,
+ const Vector< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &y,
+ const Ring &ring = Ring(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_semiring< Ring >::value
+ > * const = nullptr
+ ) {
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) ring;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Computes \f$ z = z + x * y \f$ under a given semiring.
+  *
+  * Specialisation for scalar \a x (given as an ALP Scalar \a alpha).
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class Ring,
+ Backend backend
+ >
+ RC eWiseMul(
+ Vector< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &z,
+ const Scalar< InputType1, InputStructure1, backend > &alpha,
+ const Vector< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &y,
+ const Ring &ring = Ring(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_semiring< Ring >::value
+ > * const = nullptr
+ ) {
+ (void) z;
+ (void) alpha;
+ (void) y;
+ (void) ring;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Computes \f$ z = z + x * y \f$ under a given semiring.
+  *
+  * Specialisation for scalar \a y (given as an ALP Scalar \a beta).
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2,
+ class Ring,
+ Backend backend
+ >
+ RC eWiseMul(
+ Vector< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &z,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &x,
+ const Scalar< InputType2, InputStructure2, backend > &beta,
+ const Ring &ring = Ring(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_semiring< Ring >::value
+ > * const = nullptr
+ ) {
+ (void) z;
+ (void) x;
+ (void) beta;
+ (void) ring;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Calculates the dot product, \f$ \alpha = (x,y) \f$, under a given additive
+  * monoid and multiplicative operator. The result is written to the ALP
+  * Scalar \a z.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class AddMonoid, class AnyOp,
+ Backend backend
+ >
+ RC dot(
+ Scalar< OutputType, OutputStructure, backend > &z,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &x,
+ const Vector< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &y,
+ const AddMonoid &addMonoid = AddMonoid(),
+ const AnyOp &anyOp = AnyOp(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_monoid< AddMonoid >::value &&
+ alp::is_operator< AnyOp >::value
+ > * const = nullptr
+ ) {
+ // BUGFIX: z was not voided, yielding an unused-parameter warning; all
+ // sibling stubs suppress every parameter.
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) addMonoid;
+ (void) anyOp;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * C++ scalar specialization of the dot product under an additive monoid and
+  * a multiplicative operator: the result lands in a raw \a OutputType.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class AddMonoid, class AnyOp,
+ Backend backend
+ >
+ RC dot(
+ OutputType &z,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &x,
+ const Vector< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &y,
+ const AddMonoid &addMonoid = AddMonoid(),
+ const AnyOp &anyOp = AnyOp(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_monoid< AddMonoid >::value &&
+ alp::is_operator< AnyOp >::value
+ > * const = nullptr
+ ) {
+ // BUGFIX: this stub suppressed none of its parameters (the only overload
+ // in the file that did not), yielding unused-parameter warnings.
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) addMonoid;
+ (void) anyOp;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Provides a generic implementation of the dot computation on semirings by
+  * translating it into a dot computation on an additive commutative monoid
+  * with any multiplicative operator. The result is written to the ALP
+  * Scalar \a x.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename IOType, typename IOStructure,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class Ring,
+ Backend backend
+ >
+ RC dot(
+ Scalar< IOType, IOStructure, backend > &x,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &left,
+ const Vector< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &right,
+ const Ring &ring = Ring(),
+ const std::enable_if_t<
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ !alp::is_object< IOType >::value &&
+ alp::is_semiring< Ring >::value
+ > * const = nullptr
+ ) {
+ (void) x;
+ (void) left;
+ (void) right;
+ (void) ring;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * C++ scalar specialization of the semiring dot product: the result lands
+  * in a raw \a IOType.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename IOType,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ Backend backend
+ >
+ RC dot(
+ IOType &x,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &left,
+ const Vector< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &right,
+ const Ring &ring = Ring(),
+ const std::enable_if_t<
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ !alp::is_object< IOType >::value &&
+ alp::is_semiring< Ring >::value
+ > * const = nullptr
+ ) {
+ (void) x;
+ (void) left;
+ (void) right;
+ (void) ring;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * This is the eWiseLambda that performs length checking by recursion.
+  *
+  * In the backend implementation all vectors are distributed equally, so no
+  * need to synchronise any data structures. We do need to do error checking
+  * though, to see when to return alp::MISMATCH. That's this function.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  *
+  * @see Vector::operator[]()
+  * @see Vector::lambda_backend
+  */
+ template<
+ typename Func,
+ typename DataType1, typename DataStructure1, typename DataView1, typename InputImfR1, typename InputImfC1,
+ typename DataType2, typename DataStructure2, typename DataView2, typename InputImfR2, typename InputImfC2,
+ Backend backend,
+ typename... Args
+ >
+ RC eWiseLambda(
+ const Func f,
+ Vector< DataType1, DataStructure1, Density::Dense, DataView1, InputImfR1, InputImfC1, backend > &x,
+ const Vector< DataType2, DataStructure2, Density::Dense, DataView2, InputImfR2, InputImfC2, backend > &y,
+ Args const &... args
+ ) {
+ (void) f;
+ (void) x;
+ (void) y;
+ // BUGFIX: the pack was left unused behind a commented-out `// (void) args;`,
+ // yielding unused-parameter warnings. Suppress each pack element via a
+ // C++11-compatible pack expansion.
+ const int suppress[] = { 0, ( (void) args, 0 )... };
+ (void) suppress;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * No implementation notes. This is the `real' implementation on backend
+  * vectors: applies functor \a f element-wise over \a x.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  *
+  * @see Vector::operator[]()
+  * @see Vector::lambda_backend
+  */
+ template<
+ typename Func,
+ typename DataType, typename DataStructure, typename DataView, typename DataImfR, typename DataImfC,
+ Backend backend
+ >
+ RC eWiseLambda(
+ const Func f,
+ Vector< DataType, DataStructure, Density::Dense, DataView, DataImfR, DataImfC, backend > &x
+ ) {
+ (void) f;
+ (void) x;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Reduces a vector into a scalar. Reduction takes place according a monoid
+  * \f$ (\oplus,1) \f$, where \f$ \oplus:\ D_1 \times D_2 \to D_3 \f$ with an
+  * associated identity \f$ 1 \in \{D_1,D_2,D_3\} \f$. Elements from the given
+  * vector \f$ y \in \{D_1,D_2\} \f$ will be applied at the left-hand or right-
+  * hand side of \f$ \oplus \f$; which, exactly, is implementation-dependent
+  * but should not matter since \f$ \oplus \f$ should be associative.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename IOType, typename IOStructure,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ class Monoid,
+ Backend backend
+ >
+ RC foldl(
+ Scalar< IOType, IOStructure, backend > &alpha,
+ const Vector< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &y,
+ const Monoid &monoid = Monoid(),
+ const std::enable_if_t<
+ ! alp::is_object< IOType >::value && ! alp::is_object< InputType >::value && alp::is_monoid< Monoid >::value
+ > * const = nullptr
+ ) {
+ (void) alpha;
+ (void) y;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Sort vectors, function available to user, e.g. to sort eigenvectors.
+  * Writes the sorting permutation of \a toSort (per comparator \a cmp) into
+  * \a permutation; \a toSort itself is read-only.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ typename IndexType, typename IndexStructure, typename IndexView, typename IndexImfR, typename IndexImfC,
+ typename ValueType, typename ValueStructure, typename ValueView, typename ValueImfR, typename ValueImfC,
+ typename Compare,
+ Backend backend
+ >
+ RC sort(
+ Vector< IndexType, IndexStructure, Density::Dense, IndexView, IndexImfR, IndexImfC, backend > &permutation,
+ const Vector< ValueType, ValueStructure, Density::Dense, ValueView, ValueImfR, ValueImfC, backend > &toSort,
+ Compare cmp
+ ) noexcept {
+ (void) permutation;
+ (void) toSort;
+ (void) cmp;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * Provides a generic implementation of the 2-norm computation over \a y,
+  * writing the result into the ALP Scalar \a x. Enabled only for
+  * floating-point or complex output types.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ class Ring,
+ Backend backend
+ >
+ RC norm2(
+ Scalar< OutputType, OutputStructure, backend > &x,
+ const Vector< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &y,
+ const Ring &ring = Ring(),
+ const std::enable_if_t<
+ std::is_floating_point< OutputType >::value || grb::utils::is_complex< OutputType >::value
+ > * const = nullptr
+ ) {
+ (void) x;
+ (void) y;
+ (void) ring;
+ return UNSUPPORTED;
+ }
+
+ /**
+  * C++ scalar version of the 2-norm computation: the result lands in a raw
+  * \a OutputType. Enabled only for floating-point or complex output types.
+  *
+  * Base variant: all arguments are ignored and UNSUPPORTED is returned.
+  */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ class Ring,
+ Backend backend
+ >
+ RC norm2(
+ OutputType &x,
+ const Vector< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &y,
+ const Ring &ring = Ring(),
+ const std::enable_if_t<
+ std::is_floating_point< OutputType >::value || grb::utils::is_complex< OutputType >::value
+ > * const = nullptr
+ ) {
+ (void) x;
+ (void) y;
+ (void) ring;
+ return UNSUPPORTED;
+ }
+
+ /** @} */
+
+} // end namespace alp
+
+#endif // end _H_ALP_BASE_BLAS1
+
diff --git a/include/alp/base/blas2.hpp b/include/alp/base/blas2.hpp
new file mode 100644
index 000000000..2334d7710
--- /dev/null
+++ b/include/alp/base/blas2.hpp
@@ -0,0 +1,473 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Defines the GraphBLAS level 2 API.
+ *
+ * @author A. N. Yzelman
+ * @date 30th of March 2017
+ */
+
+#ifndef _H_ALP_BLAS2_BASE
+#define _H_ALP_BLAS2_BASE
+
+#include <functional> // NOTE(review): include targets were lost in extraction; reconstructed — verify
+#include <type_traits>
+
+#include <alp/backends.hpp>
+#include <alp/config.hpp>
+#include <alp/descriptors.hpp>
+#include <alp/rc.hpp>
+
+#include "blas1.hpp"
+#include "config.hpp"
+#include "matrix.hpp"
+#include "vector.hpp"
+
+namespace alp {
+
+ /**
+ * \defgroup BLAS2 The Level-2 Basic Linear Algebra Subroutines (BLAS)
+ *
+ * A collection of functions that allow GraphBLAS operators, monoids, and
+	 * semirings to work on a mix of zero-dimensional, one-dimensional, and
+ * two-dimensional containers.
+ *
+ * That is, these functions allow various linear algebra operations on
+ * scalars, objects of type alp::Vector, and objects of type alp::Matrix.
+ *
+ * \note The backends of each opaque data type should match.
+ *
+ * @{
+ */
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename IOType = typename Ring::D4, typename IOStructure,
+ typename IOView, typename IOImfR, typename IOImfC,
+ typename InputType1 = typename Ring::D1, typename InputStructure1,
+ typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2 = typename Ring::D2, typename InputStructure2,
+ typename InputView2, typename InputImfR2, typename InputImfC2,
+ Backend backend
+ >
+ RC vxm(
+ Vector< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &u,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &v,
+ const Matrix< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &A,
+ const Ring &ring = Ring(),
+ const std::enable_if_t< alp::is_semiring< Ring >::value > * const = nullptr
+ ) {
+ (void) u;
+ (void) v;
+		(void) A; (void) ring; // silence unused-parameter warning (matches the mxv overload)
+ return UNSUPPORTED;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename IOType, typename IOStructure, typename IOView,
+ typename IOImfR, typename IOImfC,
+ typename InputType1, typename InputStructure1, typename InputView1,
+ typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2, typename InputView2,
+ typename InputImfR2, typename InputImfC2,
+ class AdditiveMonoid, class MultiplicativeOperator,
+ Backend backend
+ >
+ RC vxm(
+ Vector< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &u,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &v,
+ const Matrix< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const std::enable_if_t<
+ alp::is_monoid< AdditiveMonoid >::value &&
+ alp::is_operator< MultiplicativeOperator >::value &&
+ !alp::is_object< IOType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ !std::is_same< InputType2, void >::value
+ > * const = nullptr
+ ) {
+ (void) u;
+ (void) v;
+ (void) A;
+ (void) add;
+ (void) mul;
+ return UNSUPPORTED;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename IOType = typename Ring::D4, typename IOStructure,
+ typename IOView, typename IOImfR, typename IOImfC,
+ typename InputType2 = typename Ring::D2, typename InputStructure2,
+ typename InputView2, typename InputImfR2, typename InputImfC2,
+ typename InputType1 = typename Ring::D1, typename InputStructure1,
+ typename InputView1, typename InputImfR1, typename InputImfC1,
+ Backend backend
+ >
+ RC mxv(
+ Vector< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &u,
+ const Matrix< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &A,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &v,
+ const Ring &ring,
+ const std::enable_if_t< alp::is_semiring< Ring >::value > * const = nullptr
+ ) {
+ (void) u;
+ (void) A;
+ (void) v;
+ (void) ring;
+ return UNSUPPORTED;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename IOType, typename IOStructure, typename IOView,
+ typename IOImfR, typename IOImfC,
+ typename InputType2, typename InputStructure2, typename InputView2,
+ typename InputImfR2, typename InputImfC2,
+ typename InputType1, typename InputStructure1, typename InputView1,
+ typename InputImfR1, typename InputImfC1,
+ class AdditiveMonoid, class MultiplicativeOperator,
+ Backend backend
+ >
+ RC mxv(
+ Vector< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &u,
+ const Matrix< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &A,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &v,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const std::enable_if_t<
+ alp::is_monoid< AdditiveMonoid >::value &&
+ alp::is_operator< MultiplicativeOperator >::value &&
+ !alp::is_object< IOType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ !std::is_same< InputType2, void >::value
+ > * const = nullptr
+ ) {
+ (void) u;
+ (void) A;
+ (void) v;
+ (void) add;
+ (void) mul;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * @see alp::eWiseLambda for the user-level specification.
+ */
+ template<
+ typename Func,
+ typename DataType, typename Structure, typename View, typename ImfR, typename ImfC,
+ Backend backend
+ >
+ RC eWiseLambda(
+ const Func f,
+ Matrix< DataType, Structure, Density::Dense, View, ImfR, ImfC, backend > &A
+ ) {
+ (void) f;
+ (void) A;
+ return UNSUPPORTED;
+ }
+
+ /**
+	 * This function provides dimension checking and defers to the
+	 * matrix-only eWiseLambda variant above for the actual implementation.
+ *
+ * @see alp::eWiseLambda for the user-level specification.
+ */
+ template<
+ typename Func,
+ typename DataType1, typename Structure1, typename View1, typename ImfR1, typename ImfC1,
+ typename DataType2, typename Structure2, typename View2, typename ImfR2, typename ImfC2,
+ Backend backend,
+ typename... Args
+ >
+ RC eWiseLambda(
+ const Func f,
+ Matrix< DataType1, Structure1, Density::Dense, View1, ImfR1, ImfC1, backend > &A,
+ const Vector< DataType2, Structure2, Density::Dense, View2, ImfR2, ImfC2, backend > &x,
+ Args const &... args
+ ) {
+ // do size checking
+ if( !( size( x ) == nrows( A ) || size( x ) == ncols( A ) ) ) {
+ std::cerr << "Mismatching dimensions: given vector of size " << size( x )
+ << " has nothing to do with either matrix dimension (" << nrows( A ) << " nor " << ncols( A ) << ").\n";
+ return MISMATCH;
+ }
+
+ return eWiseLambda( f, A, args... );
+ }
+
+ /**
+	 * For all elements in an ALP Matrix \a B, fold the value \f$ \alpha \f$
+ * into each element.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ class Monoid,
+ Backend backend
+ >
+ RC foldr(
+ const Scalar< InputType, InputStructure, backend > &alpha,
+ Matrix< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &B,
+ const Monoid &monoid = Monoid(),
+ const std::enable_if_t<
+ !alp::is_object< InputType >::value &&
+ !alp::is_object< IOType >::value &&
+ alp::is_monoid< Monoid >::value
+ > * const = nullptr
+ ) {
+ (void) alpha;
+ (void) B;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /** Folds element-wise alpha into B, operator variant */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ class Operator,
+ Backend backend
+ >
+ RC foldr(
+ const Scalar< InputType, InputStructure, backend > &alpha,
+ Matrix< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &B,
+ const Operator &op = Operator(),
+ const std::enable_if_t<
+ !alp::is_object< InputType >::value &&
+ !alp::is_object< IOType >::value &&
+ alp::is_operator< Operator >::value
+ > * const = nullptr
+ ) {
+ (void) alpha;
+ (void) B;
+ (void) op;
+ return UNSUPPORTED;
+ }
+
+ /** Folds element-wise A into B, monoid variant */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ class Monoid,
+ Backend backend
+ >
+ RC foldr(
+ const Matrix< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &A,
+ Matrix< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &B,
+ const Monoid &monoid = Monoid(),
+ const std::enable_if_t<
+ !alp::is_object< InputType >::value &&
+ !alp::is_object< IOType >::value &&
+ alp::is_monoid< Monoid >::value
+ > * const = nullptr
+ ) {
+ (void) A;
+ (void) B;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /** Folds element-wise A into B, operator variant */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ class Operator,
+ Backend backend
+ >
+ RC foldr(
+ const Matrix< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &A,
+ Matrix< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &B,
+ const Operator &op = Operator(),
+ const std::enable_if_t<
+ !alp::is_object< InputType >::value &&
+ !alp::is_object< IOType >::value &&
+ alp::is_operator< Operator >::value
+ > * const = nullptr
+ ) {
+ (void) A;
+ (void) B;
+ (void) op;
+ return UNSUPPORTED;
+ }
+
+ /** Folds element-wise B into A, monoid variant */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ class Monoid,
+ Backend backend
+ >
+ RC foldl(
+ Matrix< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &A,
+ const Matrix< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &B,
+ const Monoid &monoid = Monoid(),
+ const std::enable_if_t<
+ !alp::is_object< IOType >::value &&
+ !alp::is_object< InputType >::value &&
+ alp::is_monoid< Monoid >::value
+ > * const = nullptr
+ ) {
+ (void) A;
+ (void) B;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /** Folds element-wise B into A, operator variant */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ class Operator,
+ Backend backend
+ >
+ RC foldl(
+ Matrix< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &A,
+ const Matrix< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &B,
+ const Operator &op = Operator(),
+ const std::enable_if_t<
+ !alp::is_object< IOType >::value &&
+ !alp::is_object< InputType >::value &&
+ alp::is_operator< Operator >::value
+ > * const = nullptr
+ ) {
+ (void) A;
+ (void) B;
+ (void) op;
+ return UNSUPPORTED;
+ }
+
+ /** Folds element-wise beta into A, monoid variant */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ class Monoid,
+ Backend backend
+ >
+ RC foldl(
+ Matrix< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &A,
+ const Scalar< InputType, InputStructure, backend > &beta,
+ const Monoid &monoid = Monoid(),
+ const std::enable_if_t<
+ !alp::is_object< IOType >::value &&
+ !alp::is_object< InputType >::value &&
+ alp::is_monoid< Monoid >::value
+ > * const = nullptr
+ ) {
+ (void) A;
+ (void) beta;
+ (void) monoid;
+ return UNSUPPORTED;
+ }
+
+ /** Folds element-wise beta into A, operator variant */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure,
+ typename IOType, typename IOStructure, typename IOView, typename IOImfR, typename IOImfC,
+ class Operator,
+ Backend backend
+ >
+ RC foldl(
+ Matrix< IOType, IOStructure, Density::Dense, IOView, IOImfR, IOImfC, backend > &A,
+ const Scalar< InputType, InputStructure, backend > &beta,
+ const Operator &op = Operator(),
+ const std::enable_if_t<
+ !alp::is_object< IOType >::value &&
+ !alp::is_object< InputType >::value &&
+ alp::is_operator< Operator >::value
+ > * const = nullptr
+ ) {
+ (void) A;
+ (void) beta;
+ (void) op;
+ return UNSUPPORTED;
+ }
+
+ /**
+	 * Returns a view over the input matrix that yields the conjugate of each accessed element.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename Structure, typename View, typename ImfR, typename ImfC,
+ Backend backend,
+ std::enable_if_t<
+ !structures::is_a< Structure, structures::Square >::value
+ > * = nullptr
+ >
+ Matrix<
+ DataType, Structure, Density::Dense,
+ view::Functor< std::function< void( DataType &, const size_t, const size_t ) > >,
+ imf::Id, imf::Id,
+ backend
+ >
+ conjugate(
+ const Matrix< DataType, Structure, Density::Dense, View, ImfR, ImfC, backend > &A,
+ const std::enable_if_t<
+ !alp::is_object< DataType >::value
+ > * const = nullptr
+ ) {
+ (void) A;
+		return UNSUPPORTED; // FIXME(review): declared return type is Matrix<...>, not RC — this cannot compile
+ }
+
+ /** Specialization for square matrices */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename Structure, typename View, typename ImfR, typename ImfC,
+ Backend backend,
+ std::enable_if_t<
+ structures::is_a< Structure, structures::Square >::value
+ > * = nullptr
+ >
+ Matrix<
+ DataType, Structure, Density::Dense,
+ view::Functor< std::function< void( DataType &, const size_t, const size_t ) > >,
+ imf::Id, imf::Id,
+ backend
+ >
+ conjugate(
+ const Matrix< DataType, Structure, Density::Dense, View, ImfR, ImfC, backend > &A,
+ const std::enable_if_t<
+ !alp::is_object< DataType >::value
+ > * const = nullptr
+ ) {
+ (void) A;
+		return UNSUPPORTED; // FIXME(review): declared return type is Matrix<...>, not RC — this cannot compile
+ }
+ /** @} */
+
+} // namespace alp
+
+#endif // end _H_ALP_BLAS2_BASE
diff --git a/include/alp/base/blas3.hpp b/include/alp/base/blas3.hpp
new file mode 100644
index 000000000..1875ddc4e
--- /dev/null
+++ b/include/alp/base/blas3.hpp
@@ -0,0 +1,336 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ */
+
+#ifndef _H_ALP_BLAS3_BASE
+#define _H_ALP_BLAS3_BASE
+
+#include <graphblas/utils/iscomplex.hpp> // use from grb
+
+#include <alp/backends.hpp> // NOTE(review): include targets were lost in extraction; reconstructed — verify
+#include <alp/descriptors.hpp>
+#include <alp/rc.hpp>
+#include <alp/structures.hpp>
+
+#include "matrix.hpp"
+#include "vector.hpp"
+#include "io.hpp"
+
+
+namespace alp {
+
+ /**
+ * \defgroup BLAS3 The Level-3 Basic Linear Algebra Subroutines (BLAS)
+ *
+ * A collection of functions that allow GraphBLAS semirings to work on
+	 * one or more two-dimensional sparse containers (i.e., sparse matrices).
+ *
+ * @{
+ */
+
+ /**
+ * @brief Computes \f$ C = A . B \f$ for a given monoid.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class MulMonoid,
+ Backend backend
+ >
+ RC eWiseApply(
+ Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &C,
+ const Matrix< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &A,
+ const Matrix< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &B,
+ const MulMonoid &mulmono,
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_monoid< MulMonoid >::value
+ > * const = nullptr
+ ) {
+ (void) C;
+ (void) A;
+ (void) B;
+ (void) mulmono;
+ return UNSUPPORTED;
+ }
+
+
+ /**
+ * Computes \f$ C = alpha . B \f$ for a given monoid.
+ *
+ * Case where \a A is a scalar.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class MulMonoid,
+ Backend backend
+ >
+ RC eWiseApply(
+ Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &C,
+ const Scalar< InputType1, InputStructure1, backend > &alpha,
+ const Matrix< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &B,
+ const MulMonoid &mulmono,
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_monoid< MulMonoid >::value
+ > * const = nullptr
+ ) {
+ (void) C;
+ (void) alpha;
+ (void) B;
+ (void) mulmono;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ C = A . beta \f$ for a given monoid.
+ *
+ * Case where \a B is a scalar.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2,
+ class MulMonoid,
+ Backend backend
+ >
+ RC eWiseApply(
+ Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &C,
+ const Matrix< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &A,
+ const Scalar< InputType2, InputStructure2, backend > &beta,
+ const MulMonoid &mulmono,
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_monoid< MulMonoid >::value
+ > * const = nullptr
+ ) {
+ (void) C;
+ (void) A;
+ (void) beta;
+ (void) mulmono;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise multiplication of two matrices,
+ * \f$ C = C + A .* B \f$,
+ * under a given semiring.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ Backend backend
+ >
+ RC eWiseMul(
+ Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &C,
+ const Matrix< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &A,
+ const Matrix< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &B,
+ const Ring &ring = Ring(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_semiring< Ring >::value
+ > * const = nullptr
+ ) {
+ (void) C;
+ (void) A;
+ (void) B;
+ (void) ring;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * eWiseMul, version where A is a scalar.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ Backend backend
+ >
+ RC eWiseMul(
+ Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &C,
+ const Scalar< InputType1, InputStructure1, backend > &alpha,
+ const Matrix< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &B,
+ const Ring &ring = Ring(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_semiring< Ring >::value
+ > * const = nullptr
+ ) {
+ (void) C;
+ (void) alpha;
+ (void) B;
+ (void) ring;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * eWiseMul, version where B is a scalar.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2,
+ Backend backend
+ >
+ RC eWiseMul(
+ Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &C,
+ const Matrix< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &A,
+ const Scalar< InputType2, InputStructure2, backend > &beta,
+ const Ring &ring = Ring(),
+ const std::enable_if_t<
+ !alp::is_object< OutputType >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ alp::is_semiring< Ring >::value
+ > * const = nullptr
+ ) {
+ (void) C;
+ (void) A;
+ (void) beta;
+ (void) ring;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * @brief Outer product of two vectors. The result matrix \a A will contain \f$ uv^T \f$.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView, typename OutputImfR, typename OutputImfC,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class Operator,
+ Backend backend
+ >
+ RC outer(
+ Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &A,
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &u,
+ const Vector< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &v,
+ const Operator &mul = Operator(),
+ const std::enable_if_t<
+ alp::is_operator< Operator >::value &&
+ !alp::is_object< InputType1 >::value &&
+ !alp::is_object< InputType2 >::value &&
+ !alp::is_object< OutputType >::value
+ > * const = nullptr
+ ) {
+ (void) A;
+ (void) u;
+ (void) v;
+ (void) mul;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Returns a view over the general rank-1 matrix computed with the outer product.
+ * This avoids creating the resulting container. The elements are calculated lazily on access.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType1, typename InputStructure1, typename InputView1, typename InputImfR1, typename InputImfC1,
+ typename InputType2, typename InputStructure2, typename InputView2, typename InputImfR2, typename InputImfC2,
+ class Operator,
+ Backend backend
+ >
+ Matrix<
+ typename Operator::D3, structures::General, Density::Dense,
+ view::Functor< std::function< void( InputType1 &, const size_t, const size_t ) > >,
+ imf::Id, imf::Id,
+ backend
+ >
+ outer(
+ const Vector< InputType1, InputStructure1, Density::Dense, InputView1, InputImfR1, InputImfC1, backend > &x,
+ const Vector< InputType2, InputStructure2, Density::Dense, InputView2, InputImfR2, InputImfC2, backend > &y,
+ const Operator &mul = Operator(),
+		const std::enable_if_t< // was `typename std::enable_if<` without `::type` — SFINAE guard never fired; now consistent with sibling overloads
+ alp::is_operator< Operator >::value &&
+ ! alp::is_object< InputType1 >::value &&
+ ! alp::is_object< InputType2 >::value
+ > * const = nullptr
+ ) {
+ (void) x;
+ (void) y;
+ (void) mul;
+		return UNSUPPORTED; // FIXME(review): declared return type is Matrix<...>, not RC — this cannot compile
+ }
+
+ /**
+ * Returns a view over the general rank-1 matrix computed with the outer product.
+ * Version for the case when input vectors are the same vector,
+ * which results in a symmetric matrix.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename InputStructure, typename InputView, typename InputImfR, typename InputImfC,
+ class Operator,
+ Backend backend
+ >
+ Matrix<
+ typename Operator::D3,
+ typename std::conditional<
+ grb::utils::is_complex< typename Operator::D3 >::value,
+ alp::structures::Hermitian,
+ alp::structures::Symmetric
+ >::type,
+ Density::Dense,
+ view::Functor< std::function< void( typename Operator::D3 &, const size_t, const size_t ) > >,
+ imf::Id, imf::Id,
+ backend
+ >
+ outer(
+ const Vector< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &x,
+ const Operator &mul = Operator(),
+ const std::enable_if_t<
+ alp::is_operator< Operator >::value &&
+ !alp::is_object< InputType >::value
+ > * const = nullptr
+ ) {
+ (void) x;
+ (void) mul;
+		return UNSUPPORTED; // FIXME(review): declared return type is Matrix<...>, not RC — this cannot compile
+ }
+ /**
+ * @}
+ */
+
+} // namespace alp
+
+#endif // end _H_ALP_BLAS3_BASE
diff --git a/include/alp/base/collectives.hpp b/include/alp/base/collectives.hpp
new file mode 100644
index 000000000..1ac2e87eb
--- /dev/null
+++ b/include/alp/base/collectives.hpp
@@ -0,0 +1,259 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman & J. M. Nash
+ * @date 20th of February, 2017
+ */
+
+#ifndef _H_ALP_COLL_BASE
+#define _H_ALP_COLL_BASE
+
+#include
+#include
+#include
+
+
+namespace alp {
+
+ /**
+ * A static class defining various collective operations on scalars. This
+ * class is templated in terms of the backends that are implemented-- each
+ * implementation provides its own mechanisms to handle collective
+ * communications. These are required for users employing alp::eWiseLambda,
+ * or for users who perform explicit SPMD programming.
+ */
+ template< enum Backend implementation >
+ class collectives {
+
+ private:
+ /** Disallow creating an instance. */
+ collectives() {}
+
+ public:
+ /**
+ * Schedules an allreduce operation of a single object of type IOType per
+ * process. The allreduce shall be complete by the end of the call. This is a
+ * collective graphBLAS operation. After the collective call finishes, each
+ * user process will locally have available the allreduced value.
+ *
+ * Since this is a collective call, there are \a P values \a inout spread over
+ * all user processes. Let these values be denoted by \f$ x_s \f$, with
+ * \f$ s \in \{ 0, 1, \ldots, P-1 \}, \f$ such that \f$ x_s \f$ equals the
+ * argument \a inout on input at the user process with ID \a s. Let
+ * \f$ \pi:\ \{ 0, 1, \ldots, P-1 \} \to \{ 0, 1, \ldots, P-1 \} \f$ be a
+ * bijection, some unknown permutation of the process ID. This permutation is
+ * must be fixed for any given combination of GraphBLAS implementation and value
+ * \a P. Let the binary operator \a op be denoted by \f$ \odot \f$.
+ *
+ * This function computes \f$ \odot_{i=0}^{P-1} x_{\pi(i)} \f$ and writes the
+ * exact same result to \a inout at each of the \a P user processes.
+ *
+ * In summary, this means 1) this operation is coherent across all processes and
+ * produces bit-wise equivalent output on all user processes, and 2) the result
+ * is reproducible across different runs using the same input and \a P. Yet it
+ * does \em not mean that the order of addition is fixed.
+ *
+ * Since each user process supplies but one value, there is no difference
+ * between a reduce-to-the-left versus a reduce-to-the-right (see alp::reducel
+ * and alp::reducer).
+ *
+ * @tparam descr The GraphBLAS descriptor.
+ * Default is alp::descriptors::no_operation.
+ * @tparam Operator Which operator to use for reduction.
+ * @tparam IOType The type of the to-be reduced value.
+ *
+ * @param[in,out] inout On input: the value at the calling process to be
+ * reduced. On output: the reduced value.
+ * @param[in] op The associative operator to reduce by.
+ *
+ * \note If \a op is commutative, the implementation is free to employ a different
+ * allreduce algorithm, as long as it is documented well enough so that
+ * its cost can be quantified.
+ *
+ * @returns alp::SUCCESS When the operation succeeds as planned.
+ * @returns alp::PANIC When the communication layer unexpectedly fails. When
+ * this error code is returned, the library enters an
+ * undefined state.
+ *
+ * \parblock
+ * \par Valid descriptors:
+ * -# alp::descriptors::no_operation
+ * -# alp::descriptors::no_casting
+ * Any other descriptors will be ignored.
+ * \endparblock
+ *
+ * \parblock
+ * \par Performance semantics:
+ * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$
+ * -# local work: \f$ N*Operator \f$ ;
+ * -# transferred bytes: \f$ N \f$ ;
+ * -# BSP cost: \f$ Ng + N*Operator + l \f$;
+ * \endparblock
+ */
+ template< Descriptor descr = descriptors::no_operation, typename Operator, typename IOType >
+ static RC allreduce( IOType & inout, const Operator op = Operator() ) {
+ (void)inout;
+ (void)op;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Schedules a reduce operation of a single object of type IOType per process.
+ * The reduce shall be complete by the end of the call. This is a collective
+ * graphBLAS operation. The BSP costs are as for the PlatformBSP #reduce.
+ *
+ * Since this is a collective call, there are \a P values \a inout spread over
+ * all user processes. Let these values be denoted by \f$ x_s \f$, with
+ * \f$ s \in \{ 0, 1, \ldots, P-1 \}, \f$ such that \f$ x_s \f$ equals the
+ * argument \a inout on input at the user process with ID \a s. Let
+ * \f$ \pi:\ \{ 0, 1, \ldots, P-1 \} \to \{ 0, 1, \ldots, P-1 \} \f$ be a
+ * bijection, some unknown permutation of the process ID. This permutation
+ * must be fixed for any given combination of GraphBLAS implementation and value
+ * \a P. Let the binary operator \a op be denoted by \f$ \odot \f$.
+ *
+ * This function computes \f$ \odot_{i=0}^{P-1} x_{\pi(i)} \f$ and writes the
+ * result to \a inout at the user process with ID \a root.
+ *
+ * In summary, the result is reproducible across different runs using the
+ * same input and \a P. Yet it does \em not mean that the order of addition is
+ * fixed.
+ *
+ * Since each user process supplies but one value, there is no difference
+ * between a reduce-to-the-left versus a reduce-to-the-right (see alp::reducel
+ * and alp::reducer).
+ *
+ * @tparam descr The GraphBLAS descriptor.
+ * Default is alp::descriptors::no_operation.
+ * @tparam Operator Which operator to use for reduction.
+ * @tparam IOType The type of the to-be reduced value.
+ *
+ * @param[in,out] inout On input: the value at the calling process to be
+ * reduced. On output at process \a root: the reduced value.
+ * On output as non-root processes: same value as on input.
+ * @param[in] op The associative operator to reduce by.
+ * @param[in] root Which process should hold the reduced value. This
+ * number must be larger or equal to zero, and must be
+ * strictly smaller than the number of user processes
+ * \a P.
+ *
+ * @return SUCCESS When the function completes successfully.
+ * @return ILLEGAL When root is larger or equal than \a P. When this code is
+ * returned, the state of the GraphBLAS shall be as though
+ * this call was never made.
+ * @return PANIC When an unmitigable error within the GraphBLAS occurs.
+ * Upon returning this error, the GraphBLAS enters an
+ * undefined state.
+ *
+ * \note If \a op is commutative, the implementation is free to employ a different
+ * reduce algorithm, as long as it is documented well enough so that
+ * its cost can be quantified.
+ *
+ * \parblock
+ * \par Performance semantics:
+ * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$
+ * -# local work: \f$ N*Operator \f$ ;
+ * -# transferred bytes: \f$ N \f$ ;
+ * -# BSP cost: \f$ Ng + N*Operator + l \f$;
+ * \endparblock
+ */
+ template< Descriptor descr = descriptors::no_operation, typename Operator, typename IOType >
+ static RC reduce( IOType & inout, const size_t root = 0, const Operator op = Operator() ) {
+ (void)inout;
+ (void)op;
+ (void)root;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Schedules a broadcast operation of a single object of type IOType per
+ * process. The broadcast shall be complete by the end of the call. This is
+ * a collective graphBLAS operation. The BSP costs are as for the PlatformBSP
+ * #broadcast.
+ *
+ * @tparam IOType The type of the to-be broadcast value.
+ *
+ * @param[in,out] inout On input at process \a root: the value to be
+ * broadcast.
+ * On input at non-root processes: initial values are
+ * ignored.
+ * On output at process \a root: the input value remains
+ * unchanged.
+ * On output at non-root processes: the same value held
+ * at process ID \a root.
+ * @param[in] root The user process which is to send out the given input
+ * value \a inout so that it becomes available at all
+ * \a P user processes. This value must be larger or
+ * equal to zero and must be smaller than the total
+ * number of user processes \a P.
+ *
+ * @return SUCCESS On the successful completion of this function.
+ * @return ILLEGAL When \a root is larger or equal to \a P. If this code is
+ * returned, it shall be as though the call to this function
+ * had never occurred.
+ * @return PANIC When the function fails and the library enters an
+ * undefined state.
+ *
+ * \parblock
+ * \par Performance semantics: serial
+ * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
+ * -# local work: \f$ 0 \f$ ;
+ * -# transferred bytes: \f$ NP \f$ ;
+ * -# BSP cost: \f$ NPg + l \f$;
+ * \endparblock
+ *
+ * \par Performance semantics: two phase
+ * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
+ * -# local work: \f$ 0 \f$ ;
+ * -# transferred bytes: \f$ 2N \f$ ;
+ * -# BSP cost: \f$ 2(Ng + l) \f$;
+ * \endparblock
+ *
+ * \par Performance semantics: two level tree
+ * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
+ * -# local work: \f$ 0 \f$ ;
+ * -# transferred bytes: \f$ 2\sqrt{P}N \f$ ;
+ * -# BSP cost: \f$ 2(\sqrt{P}Ng + l) \f$;
+ * \endparblock
+ */
+ template< typename IOType >
+ static RC broadcast( IOType &inout, const size_t root = 0 ) {
+ (void)inout;
+ (void)root;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Broadcast on an array of \a IOType.
+ *
+ * The above documentation applies with \a size times sizeof(IOType)
+ * substituted in.
+ */
+ template< Descriptor descr = descriptors::no_operation, typename IOType >
+ static RC broadcast( IOType * inout, const size_t size, const size_t root = 0 ) {
+ (void)inout;
+ (void)size;
+ (void)root;
+ return UNSUPPORTED;
+ }
+
+ }; // end class ``collectives''
+
+} // end namespace alp
+
+#endif // end _H_ALP_COLL_BASE
+
diff --git a/include/alp/base/config.hpp b/include/alp/base/config.hpp
new file mode 100644
index 000000000..5a5dcc0fe
--- /dev/null
+++ b/include/alp/base/config.hpp
@@ -0,0 +1,309 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 8th of August, 2016
+ */
+
+#ifndef _H_ALP_CONFIG_BASE
+#define _H_ALP_CONFIG_BASE
+
+#include <cstddef> //size_t
+#ifndef _ALP_NO_STDIO
+ #include <iostream> //std::cout
+#endif
+#include
+
+#include
+#include <unistd.h> //sysconf
+
+#include
+
+
+// if the user did not define _ALP_BACKEND, set it to the default sequential
+// implementation
+#ifndef _ALP_BACKEND
+ #define _ALP_BACKEND reference
+#endif
+
+// if the user did not define _ALP_SECONDARY_BACKEND, set it to the default
+// sequential implementation. This setting may be used by other backends for
+// backend-specific purposes. For example, a parallel backend may use this
+// setting to control to which sequential backend it dispatches sequential
+// work.
+#ifndef _ALP_SECONDARY_BACKEND
+ #define _ALP_SECONDARY_BACKEND reference
+#endif
+
+/**
+ * The main GraphBLAS namespace.
+ *
+ * All GraphBLAS functions and objects are defined within.
+ */
+namespace alp {
+
+ /** Contains compile-time configuration constants. */
+ namespace config {
+
+ /** The default backend to be selected for an end user. */
+ static constexpr alp::Backend default_backend = _ALP_BACKEND;
+
+ /** The cache line size, in bytes. */
+ class CACHE_LINE_SIZE {
+
+ private:
+ /**
+ * The cache line size in bytes. Update this value at compile time to
+ * reflect the target architecture.
+ */
+ static constexpr size_t bytes = 64;
+
+ public:
+ /**
+ * @return The cache line size in bytes.
+ * @see alp::config::CACHE_LINE_SIZE::bytes
+ */
+ static constexpr size_t value() {
+ return bytes;
+ }
+ };
+
+ /** The SIMD size, in bytes. */
+ class SIMD_SIZE {
+
+ private:
+ /**
+ * The SIMD size, in bytes. Update this value at compile time to reflect
+ * the target architecture.
+ */
+ static constexpr size_t bytes = 32;
+
+ public:
+ /**
+ * @return The SIMD size in bytes.
+ * @see alp::config::SIMD_SIZE::bytes
+ */
+ static constexpr size_t value() {
+ return bytes;
+ }
+ };
+
+ /** How many elements of a given data type fit into a SIMD register. */
+ template< typename T >
+ class SIMD_BLOCKSIZE {
+ public:
+ /**
+ * Calculates the block size this operator should use.
+ *
+ * \warning This rounds down. If instances of T are too large, this could
+ * result in a zero value. See #value for a correction.
+ */
+ static constexpr size_t unsafe_value() {
+ return SIMD_SIZE::value() / sizeof( T );
+ }
+
+ /**
+ * The maximum of one and the number of elements that fit into a single
+ * cache line.
+ */
+ static constexpr size_t value() {
+ return unsafe_value() > 0 ? unsafe_value() : 1;
+ }
+ };
+
+ /**
+ * How many hardware threads the operating system exposes.
+ *
+ * \warning On contemporary x86-based hardware, the reported number by
+ * value() will include that of each hyper-thread. This number
+ * thus does not necessarily equal the number of cores available.
+ */
+ class HARDWARE_THREADS {
+ public:
+ /**
+ * Returns the number of online hardware threads as reported by the OS.
+ *
+ * \warning This is a UNIX system call.
+ *
+ * @returns The number of hardware threads currently online. The return
+ * type is specified by the UNIX standard.
+ */
+ static long value() {
+ return sysconf( _SC_NPROCESSORS_ONLN );
+ }
+ };
+
+ /** Benchmarking defaults. */
+ class BENCHMARKING {
+ public:
+ /** The default number of inner repetitions. */
+ static constexpr size_t inner() {
+ return 1;
+ }
+
+ /** The default number of outer repetitions. */
+ static constexpr size_t outer() {
+ return 10;
+ }
+ };
+
+ /** Memory defaults. */
+ class MEMORY {
+ public:
+
+ /** The private L1 data cache size, in bytes. */
+ static constexpr size_t l1_cache_size() {
+ return 32768;
+ }
+
+ /** What is considered a lot of memory, in 2-log of bytes. */
+ static constexpr size_t big_memory() {
+ return 31;
+ } // 2GB
+
+ /**
+ * The memory speed under random accesses of 8-byte words.
+ *
+ * @returns The requested speed in MiB/s/process.
+ *
+ * @note The default value was measured on a two-socket Ivy Bridge node
+ * with 128GB quad-channel DDR4 memory at 1600 MHz per socket.
+ *
+ * @note In the intended use of these variables, it is the ratio between
+ * #stream_memspeed and #random_access_memspeed that matters. While
+ * untested, it is reasonable to think the ratios do not change too
+ * much between architectures. Nevertheless, for best results, these
+ * numbers are best set to benchmarked values on the deployment
+ * hardware.
+ */
+ static constexpr double random_access_memspeed() {
+ return 147.298;
+ }
+
+ /**
+ * The memory speed under a limited number of streams of uncached data.
+ *
+ * @returns The requested speed in MiB/s/process.
+ *
+ * @note The default value was measured on a two-socket Ivy Bridge node
+ * with 128GB quad-channel DDR4 memory at 1600 MHz per socket.
+ *
+ * @note In the intended use of these variables, it is the ratio between
+ * #stream_memspeed and #random_access_memspeed that matters. While
+ * untested, it is reasonable to think the ratios do not change too
+ * much between architectures. Nevertheless, for best results, these
+ * numbers are best set to benchmarked values on the deployment
+ * hardware.
+ */
+ static constexpr double stream_memspeed() {
+ return 1931.264;
+ }
+
+ /**
+ * Prints memory usage info to stdout, but only for big memory allocations.
+ *
+ * @returns true if and only if this function printed information to stdout.
+ */
+ static bool report( const std::string prefix, const std::string action, const size_t size, const bool printNewline = true ) {
+#ifdef _ALP_NO_STDIO
+ (void)prefix;
+ (void)action;
+ (void)size;
+ (void)printNewline;
+ return false;
+#else
+ constexpr size_t big =
+ #ifdef _DEBUG
+ true;
+ #else
+ ( 1ul << big_memory() );
+ #endif
+ if( size >= big ) {
+ std::cout << "Info: ";
+ std::cout << prefix << " ";
+ std::cout << action << " ";
+ if( sizeof( size_t ) * 8 > 40 && ( size >> 40 ) > 2 ) {
+ std::cout << ( size >> 40 ) << " TB of memory";
+ } else if( sizeof( size_t ) * 8 > 30 && ( size >> 30 ) > 2 ) {
+ std::cout << ( size >> 30 ) << " GB of memory";
+ } else if( sizeof( size_t ) * 8 > 20 && ( size >> 20 ) > 2 ) {
+ std::cout << ( size >> 20 ) << " MB of memory";
+ } else if( sizeof( size_t ) * 8 > 10 && ( size >> 10 ) > 2 ) {
+ std::cout << ( size >> 10 ) << " kB of memory";
+ } else {
+ std::cout << size << " bytes of memory";
+ }
+ if( printNewline ) {
+ std::cout << ".\n";
+ }
+ return true;
+ }
+ return false;
+#endif
+ }
+ };
+
+ /**
+ * What data type should be used to store row indices.
+ *
+ * Some use cases may require this to be set to size_t -- others may
+ * do with (much) smaller data types instead.
+ *
+ * \note The data type for indices of general arrays is not configurable. This
+ * set of implementations use size_t for those.
+ */
+ typedef unsigned int RowIndexType;
+
+ /**
+ * What data type should be used to store column indices.
+ *
+ * Some use cases may require this to be set to size_t -- others may
+ * do with (much) smaller data types instead.
+ *
+ * \note The data type for indices of general arrays is not configurable. This
+ * set of implementations use size_t for those.
+ */
+ typedef unsigned int ColIndexType;
+
+ /**
+ * What data type should be used to refer to an array containing nonzeroes.
+ *
+ * Some use cases may require this to be set to size_t -- others may
+ * do with (much) smaller data types instead.
+ *
+ * \note The data type for indices of general arrays is not configurable. This
+ * set of implementations use size_t for those.
+ */
+ typedef size_t NonzeroIndexType;
+
+ /**
+ * What data type should be used to store vector indices.
+ *
+ * Some use cases may require this to be set to size_t -- others may
+ * do with (much) smaller data types instead.
+ *
+ * \note The data type for indices of general arrays is not configurable. This
+ * set of implementations use size_t for those.
+ */
+ typedef unsigned int VectorIndexType;
+
+ } // namespace config
+
+} // namespace alp
+
+#endif // end _H_ALP_CONFIG_BASE
diff --git a/include/alp/base/exec.hpp b/include/alp/base/exec.hpp
new file mode 100644
index 000000000..c011b7221
--- /dev/null
+++ b/include/alp/base/exec.hpp
@@ -0,0 +1,232 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 17th of April, 2017
+ */
+
+#ifndef _H_ALP_EXEC_BASE
+#define _H_ALP_EXEC_BASE
+
+#ifndef _ALP_NO_STDIO
+#include
+#endif
+#include
+#include
+
+#include
+#include
+
+
+namespace alp {
+
+ /**
+ * The various ways in which the #Launcher can be used
+ * to execute a GraphBLAS program.
+ *
+ * \warning An implementation may require different linker commands
+ * when using different modes. This is OK, since a call to
+ * the #Launcher is required to be quite different
+ * depending on which mode is used. The portability is in
+ * the GraphBLAS program being launched-- that one should
+ * never change depending on whichever mode it is used.
+ */
+ enum EXEC_MODE {
+
+ /**
+ * Automatic mode. The #Launcher can spawn user processes
+ * which will execute a given program.
+ */
+ AUTOMATIC = 0,
+
+ /**
+ * Manual mode. The user controls \a nprocs user processes
+ * which together should execute a given program, by, for
+ * example, using the #Launcher.
+ */
+ MANUAL,
+
+ /**
+ * When running from an MPI program. The user controls
+ * \a nprocs MPI programs, which, together, should execute
+ * a given GraphBLAS program.
+ */
+ FROM_MPI
+
+ };
+
+ /**
+ * Allows an auxiliary program to run any GraphBLAS program. Input data may be
+ * passed through a user-defined type. Output data will be retrieved via the
+ * same type. For implementations that support multiple user processes, the
+ * caller may explicitly set the process ID and total number of user processes.
+ *
+ * The intended use is to `just call' alp::exec which should, in its most
+ * trivial form, compile regardless of which backend is selected.
+ *
+ * @tparam mode Which #EXEC_MODE the Launcher should adhere to.
+ * @tparam implementation Which GraphBLAS implementation is to be used.
+ */
+ template< enum EXEC_MODE mode, enum Backend implementation >
+ class Launcher {
+
+ public :
+
+ /**
+ * Constructs a new Launcher. This constructor is a collective
+ * call; all \a nprocs processes that form a single Launcher
+ * group must make a call to this constructor at roughly the
+ * same time. There is an implementation-defined time-out for
+ * the creation of a Launcher group.
+ *
+ * @param[in] process_id The user process ID of the calling process.
+ * The value must be larger or equal to 0. This
+ * value must be strictly smaller than \a nprocs.
+ * This value must be unique to the calling
+ * process within this collective call across
+ * \em all \a nprocs user processes. This number
+ * \em must be strictly smaller than \a nprocs.
+ * Optional: the default is 0.
+ * @param[in] nprocs The total number of user processes making a
+ * collective call to this function. Optional: the
+ * default is 1.
+ * @param[in] hostname The hostname of one of the user processes.
+ * Optional: the default is `localhost'.
+ * @param[in] port A free port number at \a hostname. This port
+ * will be used for TCP connections to \a hostname
+ * if and only if \a nprocs is larger than one.
+ * Optional: the default value is `0'.
+ *
+ * @throws invalid_argument If #nprocs is zero.
+ * @throws invalid_argument If #process_id is greater than or
+ * equal to \a nprocs.
+ *
+ * \note An implementation may define further constraints on
+ * the input arguments, such as, obviously, on \a hostname
+ * and \a port, but also on \a nprocs and, as a result, on
+ * \a process_id.
+ */
+ Launcher( const size_t process_id = 0, // user process ID
+ const size_t nprocs = 1, // total number of user processes
+ const std::string hostname = "localhost", // one of the user process hostnames
+ const std::string port = "0" // a free port at hostname
+ ) { // standard does not specify any constraints on hostname and port
+ // so accept (and ignore) anything
+ (void)hostname; (void)port;
+
+#ifndef _ALP_NO_EXCEPTIONS
+ // sanity checks on process_id and nprocs
+ if( nprocs == 0 ) { throw std::invalid_argument( "Total number of user "
+ "processes must be "
+ "strictly larger than "
+ "zero." ); }
+ if( process_id >= nprocs ) {
+ throw std::invalid_argument( "Process ID must be strictly smaller than "
+ "total number of user processes." );
+ }
+#endif
+} // end constructor
+
+/**
+ * Executes the given GraphBLAS program. This function, depending on whether
+ * GraphBLAS is compiled in automatic or in manual mode, will either
+ * \em spawn the maximum number of available user processes or will connect
+ * exactly \a nprocs existing processes, respectively, to execute the given
+ * \a alp_program.
+ *
+ * This is a collective function call.
+ *
+ * @tparam T The type of the data to pass to the GraphBLAS program.
+ * @tparam U The type of the output data to pass back to the user.
+ *
+ * @param[in] alp_program User GraphBLAS program to be executed.
+ * @param[in] data_in Input data of user-defined type \a T.
+ * When in automatic mode, the data will only be
+ * available at user process 0 only. When in
+ * manual mode, the data will be available to
+ * this user process (with the below given
+ * \a process_id) only.
+ * @param[out] data_out Output data of user-defined type \a U. The output
+ * data should be available at user process with ID
+ * zero.
+ * @param[in] broadcast Whether the input should be broadcast from user
+ * process 0 to all other user processes. Optional;
+ * the default value is \a false.
+ *
+ * @return SUCCESS If the execution proceeded as intended.
+ * @return PANIC If an unrecoverable error was encountered while trying to
+ * execute the given GraphBLAS program.
+ *
+ * \warning An implementation can define further constraints on the validity
+ * of input arguments. The most obvious is that implementations
+ * supporting only one user process will not accept \a nprocs larger
+ * than 1.
+ *
+ * All aforementioned default values shall always be legal.
+ */
+template< typename T, typename U >
+RC exec( void ( *alp_program )( const T &, U & ), // user GraphBLAS program
+ const T & data_in,
+ U & data_out, // input & output data
+ const bool broadcast = false ) const {
+ (void)alp_program;
+ (void)data_in;
+ (void)data_out;
+ (void)broadcast;
+ // stub implementation, should be overridden by specialised implementation,
+ // so return error code
+ return PANIC;
+}
+
+/**
+ * Variable size version of the above function.
+ *
+ * @param[in] broadcast Whether the input should be broadcast from user
+ * process 0 to all other user processes. Optional;
+ * the default value is \a false. This will let user
+ * processes with ID larger than zero allocate
+ * \a in_size bytes of memory into which the data at
+ * process 0 will be copied.
+ *
+ * \todo more documentation
+ */
+template< typename U >
+RC exec( void ( *alp_program )( const void *, const size_t, U & ), const void * data_in, const size_t in_size, U & data_out, const bool broadcast = false ) const {
+ (void)alp_program;
+ (void)data_in;
+ (void)in_size;
+ (void)data_out;
+ (void)broadcast;
+ return PANIC;
+}
+
+/**
+ * Releases all GraphBLAS resources. After a call to this function, no
+ * GraphBLAS library functions may be called any longer.
+ *
+ * @return SUCCESS A call to this function may never fail.
+ */
+static RC finalize() {
+ return PANIC;
+}
+}
+; // end class `Launcher'
+
+} // end namespace ``alp''
+
+#endif // end _H_ALP_EXEC_BASE
diff --git a/include/alp/base/init.hpp b/include/alp/base/init.hpp
new file mode 100644
index 000000000..b40093cf2
--- /dev/null
+++ b/include/alp/base/init.hpp
@@ -0,0 +1,183 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 24th of January, 2017
+ */
+
+#ifndef _H_ALP_INIT_BASE
+#define _H_ALP_INIT_BASE
+
+#include
+
+#include "config.hpp"
+
+namespace alp {
+
+ /**
+ * Initialises the calling user process.
+ *
+ * If the backend supports multiple user processes, the user can invoke this
+ * function with \a P equal to one or higher; if the backend supports only a
+ * single user process, then \a P must equal one.
+ * The value for the user process ID \a s must be larger or equal to zero and
+ * must be strictly smaller than \a P. If \a P > 1, each user process must
+ * call this function collectively, each user process should pass the same
+ * value for \a P, and each user process should pass a unique value for \a s
+ * amongst all \a P collective calls made.
+ *
+ * An implementation may define that additional data is required for a call to
+ * this function to complete successfully. Such data may be passed via the
+ * final argument to this function, \a implementation_data.
+ *
+ * If the implementation does not support multiple user processes, then a
+ * value for \a implementation_data shall not be required. In particular, a
+ * call to this function with an empty parameter list shall then be legal
+ * and infer the following default arguments: zero for \a s, one for \a P,
+ * and \a NULL for \a implementation_data. When such an implementation is
+ * requested to initialise multiple user processes, the alp::UNSUPPORTED
+ * error code shall be returned.
+ *
+ * A call to this function must be matched with a call to alp::finalize().
+ * After a successful call to this function, a new call to alp::init() without
+ * first calling alp::finalize() shall incur undefined behaviour. The
+ * construction of GraphBLAS containers without a preceding successful call
+ * to alp::init() will result in invalid GraphBLAS objects. Any valid
+ * GraphBLAS containers will become invalid after a call to alp::finalize().
+ * Any use of GraphBLAS functions on invalid containers will result in
+ * undefined behaviour.
+ *
+ * @tparam backend Which GraphBLAS backend this call to init initialises.
+ *
+ * @param[in] s The ID of this user process.
+ * @param[in] P The total number of user processes.
+ * @param[in] implementation_data Any implementation-defined data structure
+ * required for successful completion of this
+ * call.
+ *
+ * \note For a pure MPI implementation, for instance, \a implementation_data
+ * may be a pointer to the MPI communicator corresponding to these user
+ * processes.
+ *
+ * \note The implementations based on PlatformBSP require direct passing of
+ * the \a bsp_t corresponding to the BSP context of the user processes;
+ * this is legal since the PlatformBSP specification defines the
+ * \a bsp_t type as a void pointer.
+ *
+ * @return SUCCESS If the initialisation was successful.
+ * @return UNSUPPORTED When the implementation does not support multiple
+ * user processes (\a P larger than 1). After a call to
+ * this function exits with this error code the library
+ * state shall be as though the call never were made.
+ * @return PANIC If this function fails, the state of this GraphBLAS
+ * implementation becomes undefined.
+ *
+ * \note There is no argument checking. If \a s is larger or equal to \a P,
+ * undefined behaviour occurs. If \a implementation_data was invalid
+ * or corrupted, undefined behaviour occurs.
+ *
+ * \par Performance semantics
+ * None. Implementations are encouraged to specify the complexity of
+ * their implementation of this function in terms of \a P.
+ *
+ * \note Compared to the GraphBLAS C specification, this function lacks a
+ * choice whether to execute in `blocking' or `non-blocking' mode.
+ * All functions in the Huawei GraphBLAS are blocking. A choice
+ * between blocking and non-blocking execution may be added later.
+ * \note Note that a blocking mode is a valid implementation of a non-
+ * blocking mode, as specified in the GraphBLAS C API. Therefore
+ * this specification will still yield a valid implementation of
+ * the C API when properly wrapped.
+ * \note Non-blocking mode with clear performance semantics are possible via
+ * carefully designed algorithmic skeletons. This is future work.
+ * \note This specification allows for alp::init() to be called multiple
+ * times from the same process and the same thread, as long as all the
+ * above requirements are met at each call. The parameters \a s and
+ * \a P (and \a implementation_data) may differ each time.
+ * \note This is an extension of the GraphBLAS C API, which only allows a
+ * single initialisation and a single matching finalisation.
+ * \note The GraphBLAS C API does not have the notion of user processes. We
+ * believe this notion is necessary to properly integrate into parallel
+ * frameworks, and also to affect proper and efficient parallel I/O.
+ */
+ template< enum Backend backend = config::default_backend >
+ RC init( const size_t s, const size_t P, void * const implementation_data ) {
+ (void)s;
+ (void)P;
+ (void)implementation_data;
+ return PANIC;
+ }
+
+ /**
+ * Implementations must ensure that initialisation without explicitly given
+ * values regarding user processes etc. should still result in a successful
+ * initialisation in all cases except where it cannot initialise due to
+ * external factors.
+ * A call to this function could, for instance, reduce to a full alp::init()
+ * while using the default parameters 0 for \a s, 1 for \a P, and \a NULL for
+ * \a implementation_data:
+ * \code
+ * return init< backend >( 0, 1, NULL );
+ * \endcode
+ *
+ * @tparam backend The backend implementation to initialise.
+ *
+ * @return SUCCESS If the initialisation was successful.
+ * @return PANIC If this function fails, the state of this GraphBLAS
+ * implementation becomes undefined.
+ */
+ template< enum Backend backend = config::default_backend >
+ RC init() {
+ return alp::init< backend >( 0, 1, NULL );
+ }
+
+ /**
+ * Finalises a graphBLAS context opened by the last call to alp::init().
+ *
+ * This function must be called collectively and must follow a call to
+ * alp::init(). After successful execution of this function, a new call
+ * to alp::init() may be made.
+ * After a call to this function, any graphBLAS objects that remain in scope
+ * are invalid. The only graphBLAS functions on invalid containers which
+ * shall \em not incur undefined behaviour are their destructors.
+ *
+ * \warning Invalid GraphBLAS containers will remain invalid no matter if a
+ * next call to alp::init() is made.
+ *
+ * @tparam backend Which GraphBLAS backend this call to init initialises.
+ *
+ * @return SUCCESS If the initialisation was successful.
+ * @return PANIC If this function fails, the state of the GraphBLAS
+ * implementation becomes undefined. This means none of its
+ * functions should be called during the remainder program
+ * execution; in particular this means a new call to
+ * alp::init() will not remedy the situation.
+ *
+ * \par Performance semantics
+ * None. Implementations are encouraged to specify the complexity of
+ * their implementation of this function in terms of the parameter
+ * \a P the matching call to alp::init() was called with.
+ */
+ template< enum Backend backend = config::default_backend >
+ RC finalize() {
+ return PANIC;
+ }
+
+} // namespace alp
+
+#endif // end _H_ALP_INIT_BASE
diff --git a/include/alp/base/internalops.hpp b/include/alp/base/internalops.hpp
new file mode 100644
index 000000000..1f9592378
--- /dev/null
+++ b/include/alp/base/internalops.hpp
@@ -0,0 +1,3178 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 8th of August, 2016
+ */
+
+#ifndef _H_ALP_INTERNAL_OPERATORS_BASE
+#define _H_ALP_INTERNAL_OPERATORS_BASE
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include "config.hpp"
+
+
+namespace alp {
+
+ namespace operators {
+
+ /** Core implementations of the standard operators in #alp::operators. */
+ namespace internal {
+
+ /**
+ * Standard argmin operator.
+ *
+ * Takes std::pair< index, value > domains only.
+ *
+ * Given two pairs (i1,v1), (i2,v2)
+ * - returns (i1,v1) if v1
+ class argmin {
+
+ static_assert( std::is_integral< IType >::value,
+ "Argmin operator may only be constructed using integral index "
+ "types." );
+
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef std::pair< IType, VType > left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef std::pair< IType, VType > right_type;
+
+ /** Alias to the output data type. */
+ typedef std::pair< IType, VType > result_type;
+
+ /** Whether this operator has an inplace foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an inplace foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+					static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+						// write the pair with the smaller value to c; ties go to the
+						// right-hand input, consistent with foldr/foldl tie-breaking
+						c->first = ( a->second < b->second ) ? a->first : b->first;
+						c->second = ( a->second < b->second ) ? a->second : b->second;
+					}
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * dubs as the output memory area.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ if( a->second < c->second ) {
+ c->first = a->first;
+ c->second = a->second;
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * dubs as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ if( b->second <= c->second ) {
+ c->first = b->first;
+ c->second = b->second;
+ }
+ }
+ };
+
+ /**
+ * Standard argmax operator.
+ *
+ * Takes std::pair< index, value > domains only.
+ *
+ * Given two pairs (i1,v1), (i2,v2)
+ * - returns (i1,v1) if v1>v2, OR
+ * - returns (i2,v2) otherwise.
+ */
+			template< typename IType, typename VType >
+			class argmax {
+
+				static_assert( std::is_integral< IType >::value,
+					"Argmax operator may only be constructed using integral index "
+					"types." );
+
+				public:
+					/** Alias to the left-hand input data type. */
+					typedef std::pair< IType, VType > left_type;
+
+					/** Alias to the right-hand input data type. */
+					typedef std::pair< IType, VType > right_type;
+
+					/** Alias to the output data type. */
+					typedef std::pair< IType, VType > result_type;
+
+					/** Whether this operator has an inplace foldl. */
+					static constexpr bool has_foldl = true;
+
+					/** Whether this operator has an inplace foldr. */
+					static constexpr bool has_foldr = true;
+
+					/**
+					 * Whether this operator is \em mathematically associative; that is,
+					 * associative when assuming equivalent data types for \a IN1, \a IN2,
+					 * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+					 */
+					static constexpr bool is_associative = true;
+
+					/**
+					 * Whether this operator is \em mathematically commutative; that is,
+					 * commutative when assuming equivalent data types for \a IN1, \a IN2,
+					 * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+					 */
+					static constexpr bool is_commutative = true;
+
+					/**
+					 * Out-of-place application of the operator.
+					 *
+					 * @param[in]  a The left-hand side input. Must be pre-allocated and initialised.
+					 * @param[in]  b The right-hand side input. Must be pre-allocated and initialised.
+					 * @param[out] c The output. Must be pre-allocated.
+					 */
+					static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+						// write the pair with the larger value to c; ties go to the
+						// right-hand input, consistent with foldr/foldl tie-breaking
+						c->first = ( a->second > b->second ) ? a->first : b->first;
+						c->second = ( a->second > b->second ) ? a->second : b->second;
+					}
+
+					/**
+					 * In-place left-to-right folding.
+					 *
+					 * @param[in]     a Pointer to the left-hand side input data.
+					 * @param[in,out] c Pointer to the right-hand side input data. This also
+					 *                  dubs as the output memory area.
+					 */
+					static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+						if( a->second > c->second ) {
+							c->first = a->first;
+							c->second = a->second;
+						}
+					}
+
+					/**
+					 * In-place right-to-left folding.
+					 *
+					 * @param[in,out] c Pointer to the left-hand side input data. This also
+					 *                  dubs as the output memory area.
+					 * @param[in]     b Pointer to the right-hand side input data.
+					 */
+					static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+						if( b->second >= c->second ) {
+							c->first = b->first;
+							c->second = b->second;
+						}
+					}
+			};
+
+ /**
+ * Standard left-hand side assignment operator.
+ *
+ * Takes binary input, but ignores the right-hand side input and simply
+ * assigns the left-hand side input to the output variable.
+ *
+ * Assumes native availability of = on the given data types, or assumes
+ * the relevant operators are properly overloaded.
+ *
+ * Assumes a binary operator defined using the =-operator in the following
+ * way, is \em associative:
+ * \code
+ * void left_assign( const IN1 x, const IN2 y, OUT &out ) {
+ * (void)y;
+ * out = x;
+ * }
+ * \endcode
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) =
+ * operators should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #has_foldl, #has_foldr, and
+ * the other fields, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class left_assign {
+
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an inplace foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an inplace foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+				 * Out-of-place application of the assignment c = a.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ (void)b;
+ *c = static_cast< result_type >( *a );
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * dubs as the output memory area.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ *c = static_cast< result_type >( *a );
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * dubs as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ (void)b;
+ (void)c;
+ }
+ };
+
+ /**
+ * Standard right-hand side assignment operator.
+ *
+ * Takes binary input, but ignores the right-hand side input and simply
+ * assigns the left-hand side input to the output variable.
+ *
+ * Assumes native availability of = on the given data types, or assumes
+ * the relevant operators are properly overloaded.
+ *
+ * Assumes a binary operator defined using the =-operator in the following
+ * way, is \em associative:
+ * \code
+ * void right_assign( const IN1 x, const IN2 y, OUT &out ) {
+ * (void)x;
+ * out = y;
+ * }
+ * \endcode
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) =
+ * operators should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #has_foldl, #has_foldr, and
+ * the other fields, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class right_assign {
+
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an inplace foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an inplace foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+				 * Out-of-place application of the assignment c = b.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ (void)a;
+ *c = *b;
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * dubs as the output memory area.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ (void)a;
+ (void)c;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * dubs as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ *c = static_cast< result_type >( *b );
+ }
+ };
+
+ /**
+ * Left-sided operator that combines an indicator and an identity function
+ * as follows:
+ *
+			 * \f$ z = x \odot y = x \text{ if } y \text{ evaluates true}. \f$
+ *
+			 * If \f$ y \f$ does not evaluate true the operator shall have no effect.
+ */
+ template< typename D1, typename D2, typename D3, enum Backend implementation = config::default_backend >
+ class left_assign_if {
+
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef D1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef D2 right_type;
+
+ /** Alias to the output data type. */
+ typedef D3 result_type;
+
+ /** Whether this operator has an inplace foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an inplace foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+				 * Out-of-place application of the conditional assignment c = a (only if b evaluates true).
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply( const D1 * __restrict__ const a, const D2 * __restrict__ const b, D3 * __restrict__ const c ) {
+ if( static_cast< const bool >( *b ) ) {
+ *c = *a;
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * dubs as the output memory area.
+ */
+ static void foldr( const D1 * __restrict__ const a, D3 * __restrict__ const c ) {
+ if( static_cast< const bool >( *c ) ) {
+ *c = static_cast< D3 >( *a );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * dubs as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( D3 * __restrict__ const c, const D2 * __restrict__ const b ) {
+ if( static_cast< bool >( *b ) ) {
+ *c = static_cast< D3 >( static_cast< D1 >( *c ) );
+ }
+ }
+ };
+
+ /**
+ * Right-sided operator that combines an indicator and an identity function
+ * as follows:
+ *
+			 * \f$ z = x \odot y = y \text{ if } x \text{ evaluates true}. \f$
+ *
+ * If \f$ x \f$ does not evaluate true the operator shall have no effect.
+ */
+ template< typename D1, typename D2, typename D3, enum Backend implementation = config::default_backend >
+ class right_assign_if {
+
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef D1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef D2 right_type;
+
+ /** Alias to the output data type. */
+ typedef D3 result_type;
+
+ /** Whether this operator has an inplace foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an inplace foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+				 * Out-of-place application of the conditional assignment c = b (only if a evaluates true).
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply( const D1 * __restrict__ const a, const D2 * __restrict__ const b, D3 * __restrict__ const c ) {
+ if( static_cast< const bool >( *a ) ) {
+ *c = *b;
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * dubs as the output memory area.
+ */
+ static void foldr( const D1 * __restrict__ const a, D3 * __restrict__ const c ) {
+ if( static_cast< const bool >( *a ) ) {
+ *c = static_cast< D3 >( static_cast< D2 >( *c ) );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * dubs as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( D3 * __restrict__ const c, const D2 * __restrict__ const b ) {
+ if( static_cast< bool >( *c ) ) {
+ *c = static_cast< D3 >( *b );
+ }
+ }
+ };
+
+ /**
+ * Standard additive operator.
+ *
+ * Assumes native availability of + on the given data types or assumes that
+ * the relevant operators are properly overloaded.
+ *
+ * Assumes that the + operator is associative \em and commutative when
+ * assuming perfect arithmetic and equal data types for \a IN1, \a IN2, and
+ * \a OUT.
+ *
+ * Non-standard or non-matching data types or non-standard (overloaded) +
+ * operators, should therefore be used with caution and may necessitate an
+ * explicit definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ // [Example Base Operator Implementation]
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class add {
+
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an inplace foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an inplace foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the addition c = a + b.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static void apply( const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ ALP_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ *c = *a + *b;
+ ALP_UTIL_RESTORE_WARNINGS
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * dubs as the output memory area.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ *c += *a;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * dubs as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ *c += *b;
+ }
+ };
+ // [Example Base Operator Implementation]
+
+ /**
+ * Standard multiplicative operator.
+ *
+ * Assumes native availability * on the given data types, or assumes
+ * the relevant operators are properly overloaded.
+ *
+ * Assumes that the * operator is associative \em and commutative when
+ * assuming perfect arithmetic and equal data types for \a IN1, \a IN2, and
+ * \a OUT.
+ *
+ * Non-standard or non-matching data types or non-standard (overloaded) *
+ * operators, should therefore be used with caution and may necessitate an
+ * explicit definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class mul {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the multiplication c = a * b.
+ *
+ * @param[in] a Pointer to the left-hand side input. Must be initialised.
+ * @param[in] b Pointer to the right-hand side input. Must be initialised.
+ * @param[out] c Pointer to where to compute the output.
+ *
+ * \warning All pointers must be valid or UB occurs.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ ALP_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ *c = *a * *b;
+ ALP_UTIL_RESTORE_WARNINGS
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * dubs as the output memory area.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ *c *= *a;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * dubs as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ *c *= *b;
+ }
+ };
+
+ /**
+ * Standard max operator.
+ *
+ * Assumes native availability of < on the given data types, or assumes
+ * the relevant operators are properly overloaded.
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) <
+ * operators, should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class max {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the max operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = \max\{a,b\} \f$.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ if( *a < *b ) {
+ *c = static_cast< OUT >( *b );
+ } else {
+ *c = static_cast< OUT >( *a );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * dubs as the output memory area.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ if( *a > *c ) {
+ *c = *a;
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * dubs as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ if( *b > *c ) {
+ *c = *b;
+ }
+ }
+ };
+
+ /**
+ * Standard min operator.
+ *
+ * Assumes native availability of > on the given data types, or assumes
+ * the relevant operators are properly overloaded.
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) >
+ * operators, should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class min {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the min operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = \min\{a,b\} \f$.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ if( *a > *b ) {
+ *c = static_cast< OUT >( *b );
+ } else {
+ *c = static_cast< OUT >( *a );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * dubs as the output memory area.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ if( *a < *c ) {
+ *c = *a;
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * dubs as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ if( *b < *c ) {
+ *c = *b;
+ }
+ }
+ };
+
+ /**
+ * Numerical subtraction operator: computes \f$ c = a - b \f$.
+ *
+ * \note The class name retains a historical misspelling of \em subtract;
+ * it is kept as-is for backwards compatibility with existing callers.
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class substract {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a - b \f$.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ *c = *a - *b;
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ *
+ * At the end of the operation, \f$ c = a - c \f$.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ *c = *a - *c;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ *
+ * At the end of the operation, \f$ c = c - b \f$.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ *c -= *b;
+ }
+ };
+
+ /**
+ * Numerical division operator: computes \f$ c = a / b \f$.
+ *
+ * \note Division by zero is not guarded here; behaviour in that case follows
+ * the underlying arithmetic of the data types involved.
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class divide {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a/b \f$.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ *c = *a / *b;
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ *
+ * At the end of the operation, \f$ c = a/c \f$.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ *c = *a / *c;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ *
+ * At the end of the operation, \f$ c = c/b \f$.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ *c /= *b;
+ }
+ };
+
+ /**
+ * Reversed numerical division operator: computes \f$ c = b / a \f$,
+ * i.e., standard division with the operands swapped.
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class divide_reverse {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = b/a \f$.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ *c = *b / *a;
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ *
+ * At the end of the operation, \f$ c = c/a \f$.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ *c /= *a;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ *
+ * At the end of the operation, \f$ c = b/c \f$.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ *c = *b / *c;
+ }
+ };
+
+ /**
+ * Equality comparison operator: computes \f$ c = (a = b) \f$, cast to the
+ * output type (i.e., \a c receives \a true when the operands compare equal,
+ * \a false otherwise).
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class equal {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = (a = b) \f$, cast to \a OUT.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ if( *a == *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ *
+ * \note On repeated folds, \a c holds the (cast) boolean result of the
+ * previous comparison, which is what subsequent comparisons see.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ if( *a == *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ if( *b == *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+ };
+
+ /**
+ * Inequality comparison operator: computes \f$ c = (a \neq b) \f$, cast to
+ * the output type (i.e., \a c receives \a true when the operands differ,
+ * \a false otherwise).
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class not_equal {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = (a \neq b) \f$, cast to \a OUT.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ ALP_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ if( *a != *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ ALP_UTIL_RESTORE_WARNINGS
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ *
+ * \note On repeated folds, \a c holds the (cast) boolean result of the
+ * previous comparison, which is what subsequent comparisons see.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ if( *a != *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ if( *b != *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+ };
+
+ /**
+ * Select-any operator: \a c receives \a a when \a a evaluates \a true,
+ * otherwise \a b when \a b evaluates \a true, and otherwise the (falsy)
+ * value of \a a. In other words, the first operand that evaluates \a true
+ * wins.
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class any_or {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \a c equals \a a if \a a evaluates \a true,
+ * otherwise \a b if \a b evaluates \a true, and otherwise \a a (falsy).
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ if( *a ) {
+ *c = static_cast< OUT >( *a );
+ } else if( *b ) {
+ *c = static_cast< OUT >( *b );
+ } else {
+ assert( ! ( *a ) );
+ *c = static_cast< OUT >( *a );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ *
+ * \a c is overwritten with \a a only when \a a evaluates \a true.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ if( *a ) {
+ *c = static_cast< result_type >( *a );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ *
+ * \a c is overwritten with \a b only when \a b evaluates \a true.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ if( *b ) {
+ *c = static_cast< result_type >( *b );
+ }
+ }
+ };
+
+ /**
+ * Logical disjunction operator: computes \f$ c = a \vee b \f$, cast to the
+ * output type.
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class logical_or {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a \vee b \f$, cast to \a OUT.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ if( *a || *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ if( *a || *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ if( *b || *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+ };
+
+ /**
+ * Logical conjunction operator: computes \f$ c = a \wedge b \f$, cast to
+ * the output type.
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class logical_and {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a \wedge b \f$, cast to \a OUT.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ if( *a && *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ if( *a && *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ if( *b && *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+ };
+
+ /**
+ * Absolute-difference operator: computes \f$ c = |a - b| \f$. The larger
+ * operand is always the minuend, so the subtraction never goes negative --
+ * this also makes the operator safe for unsigned types.
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class abs_diff {
+
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = |a - b| \f$.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ if( *a < *b ) {
+ *c = static_cast< OUT >( *b - *a );
+ } else {
+ *c = static_cast< OUT >( *a - *b );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ *
+ * At the end of the operation, \f$ c = |a - c| \f$.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ if( *a < *c ) {
+ *c -= *a;
+ } else {
+ *c = static_cast< OUT >( *a - *c );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ *
+ * At the end of the operation, \f$ c = |c - b| \f$.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ if( *b < *c ) {
+ *c -= *b;
+ } else {
+ *c = static_cast< OUT >( *b - *c );
+ }
+ }
+ };
+
+ /**
+ * ReLU-style maximum operator: \a c receives the larger of the two
+ * operands (ties resolve to \a a; same value either way).
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class relu {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = ReLU\{a,b\} = \begin{cases}
+ * a \text{, if } a>b \\
+ * b \text{, otherwise}
+ * \end{cases}\f$.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ if( *a < *b ) {
+ *c = static_cast< OUT >( *b );
+ } else {
+ *c = static_cast< OUT >( *a );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ if( *a > *c ) {
+ *c = *a;
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ if( *b > *c ) {
+ *c = *b;
+ }
+ }
+ };
+
+ /**
+ * Squared-difference operator: computes \f$ c = (a - b)^2 \f$.
+ */
+ template< typename D1, typename D2, typename D3, enum Backend implementation = config::default_backend >
+ class square_diff {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef D1 left_type;
+ /** Alias to the right-hand input data type. */
+ typedef D2 right_type;
+ /** Alias to the output data type. */
+ typedef D3 result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+ /** Squared difference is not associative. */
+ static constexpr bool is_associative = false;
+ /** Commutative, since \f$ (a-b)^2 = (b-a)^2 \f$. */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application: at the end of the operation,
+ * \f$ c = (a - b)^2 \f$.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ *c = ( *a - *b ) * ( *a - *b );
+ }
+
+ /**
+ * In-place left-to-right folding: \f$ c = (a - c)^2 \f$.
+ */
+ static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
+ *c = ( *a - *c ) * ( *a - *c );
+ }
+
+ /**
+ * In-place right-to-left folding: \f$ c = (c - b)^2 \f$.
+ *
+ * \note The argument order (output first, then the right-hand input)
+ * now matches the foldl signature of every other operator in this
+ * file; the previous order ( b, c ) was inconsistent and would
+ * break generic callers that invoke OP::foldl( c, b ).
+ */
+ static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
+ *c = ( *c - *b ) * ( *c - *b );
+ }
+ };
+
+ /**
+ * Pairing (zip) operator: combines a left operand of type \a IN1 and a
+ * right operand of type \a IN2 into a result of type
+ * std::pair< IN1, IN2 >.
+ *
+ * For use together with argmin.
+ */
+ template< typename IN1, typename IN2, enum Backend implementation = config::default_backend >
+ class zip {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+ /** The output is a pair of the two input values. */
+ typedef std::pair< IN1, IN2 > result_type;
+
+ /** No in-place foldl is provided (the output type differs from the inputs). */
+ static constexpr bool has_foldl = false;
+ /** No in-place foldr is provided (the output type differs from the inputs). */
+ static constexpr bool has_foldr = false;
+ /** Pairing is not associative. */
+ static constexpr bool is_associative = false;
+ /** Pairing is order-dependent, hence not commutative. */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application: at the end of the operation,
+ * \f$ c = (a, b) \f$.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ *c = std::make_pair( *a, *b );
+ }
+ };
+
+ /**
+ * Compares the \a first members of two pair-like operands; \a IN1 and
+ * \a IN2 are hence assumed to expose a \a first member (e.g., std::pair).
+ */
+ template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ class equal_first {
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** No in-place foldl is provided. */
+ static constexpr bool has_foldl = false;
+ /** No in-place foldr is provided. */
+ static constexpr bool has_foldr = false;
+ /** This comparison is not associative. */
+ static constexpr bool is_associative = false;
+ /** This comparison is not commutative. */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a->first == b->first \f$.
+ */
+ static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
+ if( a->first == b->first ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+ };
+
+ /**
+ * This class takes a generic operator implementation and exposes a more
+ * convenient apply() function based on it. This function allows arbitrary
+ * data types being passed as parameters, and automatically handles any
+ * casting required for the raw operator.
+ *
+ * @tparam OP The generic operator implementation.
+ *
+ * @see Operator for full details.
+ */
+ template< typename OP, enum Backend implementation = config::default_backend >
+ class OperatorBase {
+
+ protected:
+ /** The block size that should be used during map-like operations. */
+ static constexpr size_t blocksize = alp::utils::static_min( alp::config::SIMD_BLOCKSIZE< typename OP::left_type >::value(),
+ alp::utils::static_min( alp::config::SIMD_BLOCKSIZE< typename OP::right_type >::value(), alp::config::SIMD_BLOCKSIZE< typename OP::result_type >::value() ) );
+
+ /** The left-hand side input domain. */
+ typedef typename OP::left_type D1;
+
+ /** The right-hand side input domain. */
+ typedef typename OP::right_type D2;
+
+ /** The output domain. */
+ typedef typename OP::result_type D3;
+
+ public:
+ /** @return Whether this operator is mathematically associative. */
+ static constexpr bool is_associative() {
+ return OP::is_associative;
+ }
+
+ /** @return Whether this operator is mathematically commutative. */
+ static constexpr bool is_commutative() {
+ return OP::is_commutative;
+ }
+
+ /**
+ * Straightforward application of this operator. Computes \f$ x \odot y \f$
+ * and stores the result in \a z.
+ *
+ * @tparam InputType1 The type of the input parameter \a x.
+ * @tparam InputType2 The type of the input parameter \a y.
+ * @tparam OutputType The type of the output parameter \a z.
+ *
+ * \warning If \a InputType1 does not match \a D1 \em or \a InputType2 does
+ * not match \a D2 \em or \a OutputType does not match \a D3, then
+ * the input will be cast into temporary variables of the correct
+ * types, while the output will be cast from a temporary variable.
+ *
+ * \note Best performance is thus only guaranteed when all domains match.
+ *
+ * @param[in] x The left-hand side input.
+ * @param[in] y The right-hand side input.
+ * @param[out] z The output element.
+ */
+ template< typename InputType1, typename InputType2, typename OutputType >
+ static void apply( const InputType1 & x, const InputType2 & y, OutputType & z ) {
+ const D1 a = static_cast< D1 >( x );
+ const D2 b = static_cast< D2 >( y );
+ D3 temp;
+ OP::apply( &a, &b, &temp );
+ z = static_cast< OutputType >( temp );
+ }
+
+ /**
+ * This is the high-performance version of apply() in the sense that no
+ * casting is required. This version will be automatically called whenever
+ * possible.
+ */
+ static void apply( const D1 & x, const D2 & y, D3 & out ) {
+ OP::apply( &x, &y, &out );
+ }
+ };
+
+ /**
+ * A class capable of adding an out-of-place \a foldr function for an
+ * operator that is not fold-right capable, or capable of adding an in-
+ * place foldr function for an operator that is fold-right capable. For
+ * fold-right capable operators, this class is also capable of adding
+ * an efficient eWiseApply function.
+ *
+ * An operator is fold-right capable when the Base Operator \a OP
+ * provides an in-place foldr implementation, \em and whenever \a D1
+ * equals \a D3. If one of either requirements is not met, then \a OP
+ * is not fold-right capable and this class is selected to add an out-
+ * of-place foldr function.
+ *
+ * @tparam OP The generic operator implementation.
+ * @tparam guard This typename is void if and only if \a OP is not fold-
+ * right capable. In this case, this class adds an out-of-
+ * place foldr implementation to the operator.
+ * If it is not void, then this class defines an
+ * in-place foldr implementation instead.
+ *
+ * \note This specific class corresponds to the \a guard variable equal to
+ * \a void.
+ *
+ * @see Operator for full details.
+ * @see OperatorBase for additional functions exposed to the final operator.
+ */
+ template< typename OP, typename guard = void, enum Backend implementation = config::default_backend >
+ class OperatorFR : public OperatorBase< OP > {
+
+ public:
+ /**
+ * Emulated in-place application of this operator on two data elements.
+ *
+ * Computes \f$ x \odot y \f$ and writes the result into \f$ y \f$.
+ *
+ * We wish to call this in-place variant internally for brevity. However,
+ * if \a OP has no in-place variant, then we must cache the previous
+ * value of the output element or otherwise we will breach the
+ * __restrict__ contract of OP::apply.
+ * The caller must ensure the appropriate domains and casting behaviour
+ * is applicable. Note that a user is never to call these functions
+ * explicitly.
+ *
+ * @tparam InputType The type of the parameter \a x.
+ * @tparam IOType The type of the parameter \a y.
+ *
+ * \warning Additional casting and use of temporary variables may occur
+ * when \a InputType does not match \a D1 \em or \a IOType
+ * does not match \a D3.
+ *
+ * \note This implementation relies on apply().
+ *
+ * @param[in] x The value that is to be applied to \a y.
+ * @param[in,out] y The value \a x is to be applied against.
+ */
+ template< typename InputType, typename IOType >
+ static void foldr( const InputType & x, IOType & y ) {
+ typedef typename OperatorBase< OP >::D2 D2;
+ const D2 cache = static_cast< D2 >( y );
+ OperatorBase< OP >::apply( x, cache, y );
+ }
+
+ /**
+ * Out-of-place element-wise foldr function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot z_i \f$ and stores the result into
+ * \f$ z_i \f$.
+ *
+ * @tparam InputType The type of elements in \a x.
+ * @tparam IOType The type of elements in \a z.
+ *
+ * @param x The left-hand side input data.
+ * @param z Where \a x shall be mapped into.
+ * @param n How many data elements \a x and \a z contain.
+ *
+ * This version requires three buffers, streams \a x once,
+ * and streams \a z twice (once for reading, once for
+		 * writing).
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldrAA( const InputType * __restrict__ const x, IOType * __restrict__ const z, const size_t n ) {
+ // local buffers
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ D1 left_buffer[ OperatorBase< OP >::blocksize ];
+ D2 right_buffer[ OperatorBase< OP >::blocksize ];
+ D3 result_buffer[ OperatorBase< OP >::blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + OperatorBase< OP >::blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ b ] = static_cast< D2 >( z[ i ] );
+ }
+
+ // rewind source and output
+ i -= OperatorBase< OP >::blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++b ) {
+ OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ), &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ 0 ] = static_cast< D2 >( z[ i ] );
+ OP::apply( left_buffer, right_buffer, result_buffer );
+ z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
+ /**
+ * Out-of-place element-wise foldr function. Calculates
+ * \f$ \forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x \odot z_i \f$ and stores the result into
+ * \f$ z_i \f$.
+ *
+ * @tparam InputType The type of elements in \a x.
+ * @tparam IOType The type of elements in \a z.
+ *
+ * @param x The left-hand side input value.
+ * @param z Where \a x shall be mapped into.
+ * @param n How many data elements \a z contains.
+ *
+ * This version requires two buffers and streams \a z
+ * twice (once for reading, once for writing).
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldrSA( const InputType x, IOType * __restrict__ const z, const size_t n ) {
+ // local buffers
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ const D1 left_buffer = x; // this is actually mandatory in case x is a temporary
+ D2 right_buffer[ OperatorBase< OP >::blocksize ];
+ D3 result_buffer[ OperatorBase< OP >::blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + OperatorBase< OP >::blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
+ right_buffer[ b ] = static_cast< D2 >( z[ i ] );
+ }
+
+ // rewind source and output
+ i -= OperatorBase< OP >::blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++b ) {
+ OP::apply( &left_buffer, &( right_buffer[ b ] ), &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ right_buffer[ 0 ] = static_cast< D2 >( z[ i ] );
+ OP::apply( &left_buffer, right_buffer, result_buffer );
+ z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+ };
+
+ /**
+ * This class provides an in-place foldr implementation for Base Operators
+ * that are fold-right capable given its provided domains. It also implements
+ * an eWiseApply function that requires two buffers by exploiting the
+ * in-place foldr operator. Without an in-place foldr, it is still possible
+ * to implement an eWiseApply using two buffers if there is an in-place foldl
+ * added via OperatorFL. If that also fails, the eWiseApply function will be
+ * implemented using three buffers via OperatorNoFRFL.
+ *
+ * @tparam OP The generic operator implementation.
+ *
+ * @see Operator for full details.
+ * @see OperatorFR for details on fold-right capable operators and behaviour
+ * for non fold-right capable operators.
+ * @see OperatorBase for additional functions exposed to the final operator.
+ */
+ template< typename OP >
+ class OperatorFR< OP, typename std::enable_if< OP::has_foldr && std::is_same< typename OP::right_type, typename OP::result_type >::value >::type > : public OperatorBase< OP > {
+
+ private:
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ public:
+ /**
+ * In-place application of this operator on two data elements.
+ *
+ * Computes \f$ x \odot y \f$ and writes the result into \f$ y \f$.
+ *
+ * \note This variant is only called when the underlying raw operator
+ * supports in-place operations.
+ *
+ * The caller must ensure the appropriate domains and casting behaviour
+ * is applicable. Note that a user is never to call these functions
+ * explicitly.
+ *
+ * @param[in] x The value that is to be applied to \a y.
+ * @param[in,out] y The value \a x is to be applied against.
+ */
+ static void foldr( const D1 & x, D3 & y ) {
+ OP::foldr( &x, &y );
+ }
+
+ /**
+ * In-place element-wise foldr function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x \odot z_i \f$ and stores the result into \f$ z_i \f$.
+ *
+ * @tparam InputType The type of \a x.
+ * @tparam IOType The type of elements in \a z.
+ *
+ * @param[in] x The left-hand side input value.
+ * @param[in,out] z Where \a x shall be mapped into.
+ * @param[in] n How many data elements \a z contains.
+ *
+		 * This implementation requires one buffer only. It streams \a z twice,
+ * once for reading, once for writing. This function should vectorise.
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldrSA( const InputType x, IOType * __restrict__ const z, const size_t n ) {
+ // local buffers
+ const D1 left_buffer = static_cast< D1 >( x );
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ result_buffer[ b ] = static_cast< D3 >( z[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldr( &left_buffer, &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ result_buffer[ 0 ] = static_cast< D3 >( z[ i ] );
+ OP::foldr( &left_buffer, result_buffer );
+ z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
+ /**
+ * In-place element-wise foldr function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot z_i \f$ and stores the result into \f$ z_i \f$.
+ *
+ * @tparam InputType The type of elements in \a x.
+ * @tparam IOType The type of elements in \a z.
+ *
+ * @param[in] x The left-hand side input data.
+ * @param[in,out] z Where \a x shall be mapped into.
+ * @param[in] n How many data elements \a x and \a z contain.
+ *
+ * This implementation requires two buffers only. It streams \a x once,
+ * while streaming \a z twice (once for reading, once for writing). This
+ * function should vectorise.
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldrAA( const InputType * __restrict__ const x, IOType * __restrict__ const z, const size_t n ) {
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ result_buffer[ b ] = static_cast< D3 >( z[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldr( &( left_buffer[ b ] ), &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ result_buffer[ 0 ] = static_cast< D3 >( z[ i ] );
+ OP::foldr( left_buffer, result_buffer );
+ z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
+ /**
+ * In-place element-wise apply function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ z_i = x_i \odot y_i \f$.
+ *
+ * @tparam InputType1 The type of elements in \a x.
+ * @tparam InputType2 The type of elements in \a y.
+ * @tparam OutputType The type of elements in \a z.
+ *
+ * If \a InputType2 and \a D3 are not the same, then the existing data in
+ * \a y is cast to \a D3 prior to application of this in-place operator.
+ * If \a InputType1 and \a D1 are not the same, then the existing data in
+ * \a x are cast to \a D1 prior to application of this in-place operator.
+ * If \a OutputType and \a D3 are not the same, then the results of
+ * applying this operator are cast to \a OutputType prior to writing back
+ * the results.
+ *
+ * \warning The first casting behaviour may not be what you want. The two
+ * other casting behaviours are allowed by the GraphBLAS unless
+ * the alp::descriptor::no_casting is given.
+ *
+ * \note By default, this GraphBLAS implementation will only use this
+ * code when \a D2 matches \a D3 and OP::has_foldr is \a true.
+ *
+ * This implementation relies on an in-place foldr().
+ *
+ * @param[in] x The left-hand side input data. The memory range starting
+ * at \a x and ending at \a x + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n (exclusive).
+ * @param[in] y The right-hand side input data. The memory range starting
+ * at \a y and ending at \a y + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n.
+ * @param[out] z Where the map of \a x into \a y must be stored. This
+ * pointer is restricted in the sense that its memory may
+		 *               never overlap with those pointed to by \a x or \a y, as
+ * detailed above.
+ * @param[in] n How many data elements \a x, \a y, and \a z contain.
+ */
+ template< typename InputType1, typename InputType2, typename OutputType >
+ static void eWiseApply( const InputType1 * x, const InputType2 * y, OutputType * __restrict__ z, const size_t n ) {
+#ifdef _DEBUG
+#ifdef D_ALP_NO_STDIO
+ std::cout << "In OperatorFR::eWiseApply\n";
+#endif
+#endif
+ // NOTE: this variant is only active when the computation can be done using two buffers only
+
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ result_buffer[ b ] = static_cast< D3 >( y[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldr( &( left_buffer[ b ] ), &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< typename OP::left_type >( x[ i ] );
+ result_buffer[ 0 ] = static_cast< typename OP::result_type >( y[ i ] );
+ OP::foldr( left_buffer, result_buffer );
+ z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
+ }
+ }
+ };
+
+ /**
+ * A class capable of adding an out-of-place \a foldl function for an
+ * operator that is not fold-left capable, or capable of adding an in-
+ * place foldl function for an operator that is fold-left capable.
+ *
+ * An operator is fold-left capable when the Base Operator \a OP provides
+	 * an in-place foldl implementation, \em and whenever \a D1 equals \a D3.
+ * If one of either requirements is not met, then \a OP is not fold-left
+ * capable and this class is selected to add an out-of-place foldl function.
+ *
+ * @tparam OP The generic operator implementation.
+ * @tparam guard This typename is void if and only if \a OP is not fold-
+ * left capable. In this case, this class adds an
+ * out-of-place foldl implementation to the operator.
+ * If \a guard is not void, then this class defines an
+	 *               in-place foldl implementation instead.
+ *
+ * \note This specific class corresponds to the \a guard variable equal to
+ * \a void.
+ *
+ * @see Operator for full details.
+ * @see OperatorFR for additional functions exposed to the resulting
+ * operator.
+ * @see OperatorBase for additional functions exposed to the resulting
+ * operator.
+ */
+ template< typename OP, typename guard = void, enum Backend implementation = config::default_backend >
+ class OperatorFL : public OperatorFR< OP > {
+
+ private:
+ public:
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /**
+ * Emulated in-place application of this operator on two data elements.
+ *
+ * Computes \f$ x \odot y \f$ and writes the result into \f$ x \f$.
+ *
+ * We wish to call this in-place variant internally for brevity. However,
+ * if \a OP has no in-place variant, then we must cache the previous
+ * value of the output element or otherwise we will breach the
+ * __restrict__ contract of OP::apply.
+ * The caller must ensure the appropriate domains and casting behaviour
+ * is applicable. Note that a user is never to call these functions
+ * explicitly.
+ *
+ * @tparam InputType The type of the parameter \a x.
+ * @tparam IOType The type of the parameter \a y.
+ *
+ * \warning Additional casting and use of temporary variables may occur
+ * when \a InputType does not match \a D2 \em or \a IOType
+ * does not match \a D3.
+ *
+ * \note This implementation relies on apply().
+ *
+ * @param[in,out] x The value \a y is to be applied against.
+ * @param[in] y The value that is to be applied to \a x.
+ */
+ template< typename InputType, typename IOType >
+ static void foldl( IOType & x, const InputType & y ) {
+ const D1 cache = static_cast< D1 >( x );
+ OperatorBase< OP >::apply( cache, y, x );
+ }
+
+ /**
+ * Out-of-place element-wise foldl function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot y \f$ and stores the result into \f$ x_i \f$.
+ *
+ * @tparam IOType The type of elements in \a x.
+ * @tparam InputType The type of \a y.
+ *
+ * @param[in, out] x At function entry, the left-hand side input data.
+ * At function exit, the output data as defined above.
+ * @param[in] y The right-hand side input value.
+ * @param[in] n How many data elements \a x contains.
+ *
+ * This version requires two buffers and streams \a x twice (once for
+ * reading, once for writing). This function should vectorise its
+ * out-of-place operations.
+ */
+ template< typename IOType, typename InputType >
+ static void eWiseFoldlAS( IOType * __restrict__ const x, const InputType y, const size_t n ) {
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ const D2 right_buffer = y;
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::apply( &( left_buffer[ b ] ), &right_buffer, &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ x[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ OP::apply( left_buffer, &right_buffer, result_buffer );
+ x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
+ /**
+ * Out-of-place element-wise foldl function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot y_i \f$ and stores the result into \f$ x_i \f$.
+ *
+ * @tparam IOType The type of elements in \a x.
+ * @tparam InputType The type of elements in \a y.
+ *
+ * @param[in, out] x At function entry, the left-hand side input data.
+ * At function exit, the output data as defined above.
+ * @param[in] y The right-hand side input.
+ * @param[in] n How many data elements \a x and \a y contain.
+ *
+ * This version requires three buffers, streams \a y once, and streams
+ * \a x twice (once for reading, once for writing). This function should
+ * vectorise its out-of-place operations.
+ */
+ template< typename IOType, typename InputType >
+ static void eWiseFoldlAA( IOType * __restrict__ const x, const InputType * __restrict__ const y, const size_t n ) {
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ D2 right_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ b ] = static_cast< D2 >( y[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ), &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ x[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
+ OP::apply( left_buffer, right_buffer, result_buffer );
+ x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+ };
+
+ /**
+ * This class provides an in-place foldl implementation for Base Operators
+ * that are fold-left capable given its provided domains.
+ *
+ * @tparam OP The generic operator implementation.
+ *
+ * @see Operator for full details.
+	 * @see OperatorFL for details on fold-left capable operators and behaviour
+	 *                 for non fold-left capable operators.
+ * @see OperatorFR for additional functions exposed to the resulting
+ * operator.
+ * @see OperatorBase for additional functions exposed to the resulting
+ * operator.
+ */
+ template< typename OP >
+ class OperatorFL< OP, typename std::enable_if< OP::has_foldl && std::is_same< typename OP::left_type, typename OP::result_type >::value >::type > : public OperatorFR< OP > {
+
+ private:
+ public:
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /**
+ * In-place application of this operator on two data elements.
+ *
+ * Computes \f$ x \odot y \f$ and writes the result into \f$ x \f$.
+ *
+ * \note This variant is only called when the underlying raw operator
+ * supports in-place operations.
+ *
+ * The caller must ensure the appropriate domains and casting behaviour
+ * is applicable. Note that a user is never to call these functions
+ * explicitly.
+ *
+ * @param[in,out] x The value \a y is to be applied against.
+ * @param[in] y The value that is to be applied to \a x.
+ */
+ static void foldl( D3 & x, const D2 & y ) {
+ OP::foldl( &x, &y );
+ }
+
+ /**
+ * In-place element-wise foldl function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot y_i \f$ and stores the result into \f$ x_i \f$.
+ *
+ * @tparam IOType The type of elements in \a x.
+ * @tparam InputType The type of elements in \a y.
+ *
+		 * @param[in,out] x At function entry: the left-hand side input data.
+ * At function exit: the result data.
+ * @param[in] y The right-hand side input data.
+ * @param[in] n How many data elements \a x and \a y contain.
+ *
+ * This implementation requires two buffers only. It streams \a y once,
+ * while streaming \a x twice (once for reading, once for writing). This
+ * function should vectorise.
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldlAA( IOType * __restrict__ const x, const InputType * __restrict__ const y, const size_t n ) {
+ // local buffers
+ D2 right_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ right_buffer[ b ] = static_cast< D2 >( y[ i ] );
+ result_buffer[ b ] = static_cast< D3 >( x[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldl( &( result_buffer[ b ] ), &( right_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ x[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
+ result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
+ OP::foldl( result_buffer, right_buffer );
+ x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
+ /**
+ * In-place element-wise foldl function. Calculates
+ * \f$ \forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot y \f$ and stores the result into \f$ x_i \f$.
+ *
+ * @tparam IOType The type of elements in \a x.
+ * @tparam InputType The type of \a y.
+ *
+		 * @param[in,out] x At function entry: the left-hand side input data.
+ * At function exit: the result data.
+ * @param[in] y The right-hand side input value.
+ * @param[in] n How many data elements \a x contains.
+ *
+		 * This implementation requires one buffer only. It streams \a x twice
+ * (once for reading, once for writing). This function should vectorise.
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldlAS( IOType * __restrict__ const x, const InputType y, const size_t n ) {
+ // local buffers
+ const D2 right_buffer = static_cast< D2 >( y );
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ result_buffer[ b ] = static_cast< D3 >( x[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldl( &( result_buffer[ b ] ), &right_buffer );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ x[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
+ OP::foldl( result_buffer, &right_buffer );
+ x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+ };
+
+ /**
+ * A class capable of adding an in-place \a eWiseApply function for an
+ * operator that is fold-left capable but not fold-right capable.
+ *
+	 * Like OperatorFR on a fold-right capable operator, this class is
+ * capable of providing an eWiseApply function that requires only two
+ * internal buffers by making use of the in-place foldl.
+ *
+ * @tparam OP The generic operator implementation.
+	 * @tparam guard This typename is void if and only if \a OP is \em not
+	 *               fold-left capable, or is also fold-right capable. In this
+	 *               case, this class adds nothing to the resulting operator.
+ * If \a guard is not void, however, then this class adds an
+ * in-place eWiseApply implementation to this operator
+ * instead.
+ *
+ * @see Operator for full details.
+ * @see OperatorFL for additional functions exposed to the resulting
+ * operator.
+ * @see OperatorFR for additional functions exposed to the resulting
+ * operator and an alternative way of providing a more
+ * efficient eWiseApply.
+ * @see OperatorBase for additional functions exposed to the resulting
+ * operator.
+ */
+ template< typename OP, typename guard = void, enum Backend implementation = config::default_backend >
+ class OperatorNoFR : public OperatorFL< OP > {};
+
+ /**
+ * This class provides an in-place eWiseApply implementation for Base
+ * Operators that are fold-left capable given its provided domains, but not
+ * fold-right capable. This implementation uses two internal buffers and
+ * relies on an in-place foldl. If this were not possible, then the
+ * eWiseApply will be provided by OperatorNoFRFL in an implementation that
+ * requires three buffers and out-of-place operations instead.
+ *
+ * @tparam OP The generic operator implementation.
+ *
+ * @see Operator for full details.
+ * @see OperatorFL for additional functions exposed to the resulting
+ * operator.
+ * @see OperatorFR for additional functions exposed to the resulting
+ * operator.
+ * @see OperatorBase for additional functions exposed to the resulting
+ * operator.
+ */
+ template< typename OP >
+ class OperatorNoFR< OP, typename std::enable_if< OP::has_foldl && ! ( OP::has_foldr ) && std::is_same< typename OP::left_type, typename OP::result_type >::value >::type > :
+ public OperatorFL< OP > {
+
+ private:
+ public:
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /**
+ * In-place element-wise apply function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ z_i = x_i \odot y_i \f$.
+ *
+ * @tparam InputType1 The type of elements in \a x.
+ * @tparam InputType2 The type of elements in \a y.
+ * @tparam OutputType The type of elements in \a z.
+ *
+ * If the \a InputType1 and \a D3 are not the same, then the existing data
+ * in \a x is cast to \a D3 prior to application of this operator.
+ * If \a InputType2 and \a D2 are not the same, then the existing data in
+ * \a y is cast to \a D2 prior to application of this operator.
+ * If \a OutputType and \a D3 are not the same, then the result of
+ * applications of this operator are cast to \a OutputType prior to
+ * writing it back to \a z.
+ *
+ * \warning The first casting behaviour may not be what you want. The two
+ * other casting behaviours are allowed by the GraphBLAS unless
+ * the alp::descriptor::no_casting is given.
+ *
+ * \note By default, this GraphBLAS implementation will only use this
+		 *       code when \a D1 matches \a D3 and OP::has_foldl is \a true.
+		 *       However, this implementation will never be enabled if
+		 *       OP::has_foldr is \a true.
+ *
+ * This implementation relies on an in-place foldl().
+ *
+ * @param[in] x The left-hand side input data. The memory range starting
+ * at \a x and ending at \a x + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n (exclusive).
+ * @param[in] y The right-hand side input data. The memory range starting
+ * at \a y and ending at \a y + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n.
+ * @param[out] z Where the map of \a x into \a y must be stored. This
+ * pointer is restricted in the sense that its memory may
+		 *               never overlap with those pointed to by \a x or \a y, as
+ * detailed above.
+ * @param[in] n How many data elements \a x, \a y, and \a z contain.
+ */
+ template< typename InputType1, typename InputType2, typename OutputType >
+ static void eWiseApply( const InputType1 * x, const InputType2 * y, OutputType * __restrict__ z, const size_t n ) {
+#ifdef _DEBUG
+#ifdef D_ALP_NO_STDIO
+ std::cout << "In OperatorNoFR::eWiseApply\n";
+#endif
+#endif
+ // NOTE: this variant is only active when the computation can be done using two buffers only
+
+ // local buffers
+ D2 right_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ right_buffer[ b ] = static_cast< D2 >( y[ i ] );
+ result_buffer[ b ] = static_cast< D3 >( x[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldl( &( result_buffer[ b ] ), &( right_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
+ result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
+ OP::foldl( result_buffer, right_buffer );
+ z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
+ }
+ }
+ };
+
+ /**
+ * A class capable of adding an out-of-place \a eWiseApply function for an
+	 * operator that, given its domains, is not fold-left capable \em and not
+ * fold-right capable.
+ *
+ * If the given operator is not fold-left and not fold-right capable, then
+ * both OperatorFR and OperatorNoFR have not yet added an eWiseApply
+ * implementation. However, if there was already an in-place foldr or an
+ * in-place foldl available, then this class will add no new functions to
+ * the resulting operator.
+	 * In that case, a specialisation of this class provides an out-of-place
+	 * eWiseApply implementation that relies on OperatorBase::apply instead.
+ *
+ * @tparam OP The generic operator implementation.
+ * @tparam guard This typename is void if and only if there is already an
+ * in-place eWiseApply defined by the base OperatorNoFR
+ * class or by the OperatorFR class. In this case, this
+ * class does not add any new public methods.
+ * If it is not void, then this class defines an
+ * out-of-place eWiseApply function.
+ *
+ * \note This specific class corresponds to the \a guard variable equal to
+ * \a void.
+ *
+ * @see Operator for full details.
+ * @see OperatorNoFR for additional functions exposed to the resulting
+ * operator.
+ * @see OperatorFL for additional functions exposed to the resulting
+ * operator.
+ * @see OperatorFR for additional functions exposed to the resulting
+ * operator and an alternative way of providing a more
+ * efficient eWiseApply.
+ * @see OperatorBase for additional functions exposed to the resulting
+ * operator.
+ */
+ template< typename OP, typename guard = void, enum Backend implementation = config::default_backend >
+ class OperatorNoFRFL : public OperatorNoFR< OP > {};
+
+ /**
+ * A class that adds an out-of-place \a eWiseApply function for an operator
+ * that, given its domains, is not fold-left capable \em and not fold-right
+ * capable.
+ *
+ * Contains further specialisations for an operator that is not fold-left,
+ * capable \em and not fold-right capable. This means we have to supply an
+ * eWiseApply function that uses the normal OperatorBase::apply function,
+ * and thus uses three buffers instead of the two buffers required by its
+ * in-place counterparts.
+ *
+ * @tparam OP The generic operator implementation.
+ *
+ * @see Operator for full details.
+ * @see OperatorNoFR for additional functions exposed to the resulting
+ * operator.
+ * @see OperatorFL for additional functions exposed to the resulting
+ * operator.
+ * @see OperatorFR for additional functions exposed to the resulting
+ * operator and an alternative way of providing a more
+ * efficient eWiseApply.
+ * @see OperatorBase for additional functions exposed to the resulting
+ * operator and the OperatorBase::apply function this
+ * class will use.
+ */
+ template< typename OP >
+ class OperatorNoFRFL< OP,
+ typename std::enable_if< ( ! ( OP::has_foldl ) || ! ( std::is_same< typename OP::left_type, typename OP::result_type >::value ) ) &&
+ ( ! ( OP::has_foldr ) || ! ( std::is_same< typename OP::right_type, typename OP::result_type >::value ) ) >::type > : public OperatorNoFR< OP > {
+
+ private:
+ public:
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /** \anchor OperatorNoFRFLeWiseApply
+ *
+ * Standard out-of-place element-wise apply function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ z_i = x_i \odot y_i \f$.
+ *
+ * This is the non-public variant that operates on raw arrays.
+ *
+ * @tparam InputType1 The type of elements in \a x.
+ * @tparam InputType2 The type of elements in \a y.
+ * @tparam OutputType The type of elements in \a z.
+ *
+ * If \a InputType1 and \a D1 are not the same, then the existing data in
+ * \a x will be cast to \a D1 prior to application of this operator.
+ * If \a InputType2 and \a D2 are not the same, then the existing data in
+ * \a y will be cast to \a D2 prior to application of this operator.
+ * If \a OutputType and \a D3 are not the same, then the results of
+ * applications of this operator are cast to \a OutputType prior to
+ * writing them back to \a z.
+ *
+	 * \note The GraphBLAS can explicitly control all \em three of these
+ * casting behaviours via alp::descriptors::no_casting.
+ *
+ * \warning With the in-place variants of this code, unwanted behaviour
+ * cannot be prevented by use of alp::descriptors::no_casting.
+ * Therefore the current implementation only calls the in-place
+ * variants when \a D1 equals \a D3 (for foldl-based in-place),
+ * or when \a D2 equals \a D3 (for foldr-based ones).
+ *
+ * @param[in] x The left-hand side input data. The memory range starting
+ * at \a x and ending at \a x + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n (exclusive).
+ * @param[in] y The right-hand side input data. The memory range starting
+ * at \a y and ending at \a y + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n.
+ * @param[out] z Where the map of \a x into \a y must be stored. This
+ * pointer is restricted in the sense that its memory may
+	 *             never overlap with those pointed to by \a x or \a y, as
+ * detailed above.
+ * @param[in] n How many data elements \a x, \a y, and \a z contain.
+ */
+ template< typename InputType1, typename InputType2, typename OutputType >
+ static void eWiseApply( const InputType1 * x, const InputType2 * y, OutputType * __restrict__ z, const size_t n ) {
+#ifdef _DEBUG
+#ifdef D_ALP_NO_STDIO
+ std::cout << "In OperatorNoFRFL::eWiseApply\n";
+#endif
+#endif
+ // NOTE: this variant is only active when the computation can NOT be done using two buffers only
+
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ D2 right_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ b ] = static_cast< D2 >( y[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ), &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
+ OP::apply( left_buffer, right_buffer, result_buffer );
+ z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
+ }
+ }
+ };
+
+ /**
+ * This is the operator interface exposed to the GraphBLAS implementation.
+ *
+ * \warning Note that most GraphBLAS usage requires associative operators.
+ * While very easily possible to create non-associative operators
+ * using this interface, passing them to GraphBLAS functions,
+ * either explicitly or indirectly (by, e.g., including them in a
+ * alp::Monoid or alp::Semiring), will lead to undefined
+ * behaviour.
+ *
+ * This class wraps around a base operator of type \a OP we denote by
+ * \f$ \odot:\ D_1\times D_2 \to D_3 \f$.
+ *
+ * \parblock
+ * \par Base Operators
+ *
+ * The class \a OP is expected to define the following public function:
+ * - \a apply, which takes three pointers to parameters \f$ x \in D_1 \f$
+ * \f$ y \in D_2 \f$, and \f$ z \in D_3 \f$ and computes
+ * \f$ z = x \odot y \f$.
+ *
+ * It is also expected to define the following types:
+ * - \a left_type, which corresponds to \f$ D_1 \f$,
+ * - \a right_type, which corresponds to \f$ D_2 \f$,
+ * - \a result_type, which corresponds to \f$ D_3 \f$.
+ *
+ * It is also expected to define the following two public boolean fields:
+ * - \a has_foldr
+ * - \a has_foldl
+ *
+ * If \a has_foldr is \a true, then the class \a OP is expected to also
+ * define the function
+ * - foldr, which takes two pointers to parameters \f$ x \in D_1 \f$
+ * and \f$ z \in D_2 \subseteq D_3 \f$ and stores in \a z the result of
+ * \f$ x \odot z \f$.
+ *
+	 * If \a has_foldl is \a true, then the class \a OP is expected to also
+ * define the function
+ * - foldl, which takes two pointers to parameters
+ * \f$ z \in D_1 \subseteq D_3 \f$ and \f$ y \in D_2 \f$ and stores in
+ * \a z the result of \f$ z \odot y \f$.
+ *
+ * For examples of these base operators, see alp::operators::internal::max
+ * or alp::operators::internal::mul. An example of a full implementation,
+ * in this case for numerical addition, is the following:
+ *
+ * \snippet internalops.hpp Example Base Operator Implementation
+ *
+ * \note GraphBLAS users should never call these functions directly. This
+ * documentation is provided for developers to understand or extend
+ * the current implementation, for example to include new operators.
+ *
+ * \warning When calling these functions directly, note that the pointers
+ * to the memory areas are declared using the \em restrict key
+ * word. One of the consequences is that all pointers given in a
+ * single call may never refer to the same memory area, or
+	 *          undefined behaviour is invoked.
+ *
+ * \endparblock
+ *
+ * \parblock
+ * \par The exposed GraphBLAS Operator Interface
+ *
+ * The Base Operators as illustrated above are wrapped by this class to
+	 * provide a more convenient API. It translates the functionality of any Base
+ * Operator and exposes the following interface instead:
+ *
+ * -# apply, which takes three parameters \f$ x, y, z \f$ of arbitrary
+ * types and computes \f$ z = x \odot y \f$ after performing any
+ * casting if required.
+ * -# foldr, which takes two parameters \f$ x, z \f$ of arbitrary types
+ * and computes \f$ z = x \odot z \f$ after performing any casting if
+ * required.
+ * -# foldl, which takes two parameters \f$ z, y \f$ of arbitrary types
+ * and computes \f$ z = z \odot y \f$ after performing any casting if
+ * required.
+ * -# eWiseApply, which takes three pointers to arrays \f$ x, y, z \f$
+ * and a size \a n. The arrays can correspond to elements of any type,
+ * all three with length at least \a n. For every i-th element of the
+ * three arrays, on the values \f$ x_i, y_i, z_i \f$, \f$ z_i \f$ will
+ * be set to \f$ x_i \odot y_i \f$.
+ * -# foldrArray, which takes a pointer to an array \f$ x \f$, a
+	 *    parameter \f$ z \f$ of arbitrary type, and a size \a n as parameters.
+ * The value \f$ z \f$ will be overwritten to \f$ x_i \odot z \f$ for
+ * each of the \f$ i \in \{ 0, 1, \ldots, n-1 \} \f$. The order of
+ * application, in the sense of which \f$ i \f$ are processed first,
+ * is undefined.
+ * -# foldlArray, which takes as parameters: \f$ z \f$ of arbitrary type,
+	 *    an array \f$ y \f$, and a size \a n. The value \f$ z \f$ will be
+ * overwritten to \f$ z \odot y_i \f$ for each of the
+ * \f$ i \in \{ 0, 1, \ldots, n-1 \} \f$. The order of application, in
+ * the sense of which \f$ i \f$ are processed first, is undefined.
+ * \endparblock
+ *
+ * \note This class only allows wrapping of stateless base operators. This
+ * GraphBLAS implementation in principle allows for stateful
+ * operators, though they must be provided by a specialised class
+ * which directly implements the above public interface.
+ *
+ * @see OperatorBase::apply
+ * @see OperatorFR::foldr
+ * @see OperatorFL::foldl
+ * @see \ref OperatorNoFRFLeWiseApply
+ * @see Operator::foldrArray
+ * @see Operator::foldlArray
+ *
+ * \parblock
+ * \par Providing New Operators
+ *
+ * New operators are easily added to this
+ * GraphBLAS implementation by providing a base operator and wrapping this
+ * class around it, as illustrated, e.g., by alp::operators::add as follows:
+ *
+ * \snippet ops.hpp Operator Wrapping
+ *
+	 * These need to be compatible with the GraphBLAS type traits, specifically,
+	 * the #is_operator template. To ensure this, a specialisation of it must be
+	 * provided:
+ *
+ * \snippet ops.hpp Operator Type Traits
+ * \endparblock
+ */
+ template< typename OP, enum Backend implementation = config::default_backend >
+ class Operator : public OperatorNoFRFL< OP > {
+
+ private:
+ public:
+ /** The maximum block size when vectorising this operation. */
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /** The left-hand side input domain of this operator. */
+ typedef typename OperatorBase< OP >::D1 D1;
+
+ /** The right-hand side input domain of this operator. */
+ typedef typename OperatorBase< OP >::D2 D2;
+
+ /** The output domain of this operator. */
+ typedef typename OperatorBase< OP >::D3 D3;
+
+ /**
+ * Reduces a vector of type \a InputType into a value in \a IOType
+ * by repeated application of this operator. The \a IOType is cast
+ * into \a D3 prior reduction. The \a InputType is cast into \a D1
+ * during reduction. The final result is cast to IOType after
+ * reduction. The reduction happens `right-to-left'.
+ *
+		 * This implementation relies on the \a foldr, whether it be a
+ * true in-place or emulated version.
+ *
+ * @param[in,out] out On input, the initial value to be used for
+ * reduction. On output, all elements of \a x
+ * have been applied to \a out.
+ * @param[in] x A vector of size \a n with elements of type \a left_type.
+ * @param[in] n A positive integer (can be 0).
+ */
+ template< typename IOType, typename InputType >
+ static void foldrArray( const InputType * __restrict__ const x, IOType & out, const size_t n ) {
+ // prepare scalar buffer
+ D3 reduced = static_cast< D3 >( out );
+ // prepare vectorisation buffer
+ D1 left_buffer[ blocksize ];
+ // blockwise application
+ size_t i = n - 1;
+ while( i - blocksize + 1 < n ) {
+ // load into buffer
+ for( size_t b = 0; b < blocksize; --i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ }
+ // do reduce
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldr( &( left_buffer[ b ] ), &reduced );
+ }
+ }
+ // direct application for remainder
+ for( ; i < n; --i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ OP::foldr( left_buffer, &reduced );
+ }
+ // write out
+ out = static_cast< IOType >( reduced );
+ }
+
+ /**
+ * Reduces a vector of type \a InputType into a value in \a IOType
+ * by repeated application of this operator. The \a IOType is cast
+ * into \a D3 prior reduction. The \a InputType is cast into \a D2
+ * during reduction. The final result is cast to IOType after
+ * reduction. The reduction happens `left-to-right'.
+ *
+		 * This implementation relies on the \a foldl, whether it be a
+ * true in-place or emulated version.
+ *
+ * @param[in,out] out On input, the initial value to be used for
+ * reduction. On output, all elements of \a x
+ * have been applied to \a out.
+		 * @param[in] x A vector of size \a n with elements of type \a right_type.
+ * @param[in] n A positive integer (can be 0).
+ */
+ template< typename IOType, typename InputType >
+ static void foldlArray( IOType & out, const InputType * __restrict__ const x, const size_t n ) {
+ // prepare scalar buffer
+ D3 reduced = static_cast< D3 >( out );
+ // prepare vectorisation buffer
+ D2 right_buffer[ blocksize ];
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffer
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ right_buffer[ b ] = static_cast< D2 >( x[ i ] );
+ }
+ // do reduce
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldl( &reduced, &( right_buffer[ b ] ) );
+ }
+ }
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ right_buffer[ 0 ] = static_cast< D2 >( x[ i ] );
+ OP::foldl( &reduced, right_buffer );
+ }
+ // write out
+ out = static_cast< IOType >( reduced );
+ }
+ };
+
+ } // namespace internal
+
+ } // namespace operators
+
+} // namespace alp
+
+#endif // _H_ALP_INTERNAL_OPERATORS_BASE
+
diff --git a/include/alp/base/internalrels.hpp b/include/alp/base/internalrels.hpp
new file mode 100644
index 000000000..4ca19a996
--- /dev/null
+++ b/include/alp/base/internalrels.hpp
@@ -0,0 +1,835 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author D. G. Spampinato
+ * @date 2nd of November, 2022
+ */
+
+#ifndef _H_ALP_INTERNAL_RELATIONS_BASE
+#define _H_ALP_INTERNAL_RELATIONS_BASE
+
+#include
+
+#include
+#include
+
+#include "internalops.hpp"
+
+
+namespace alp {
+
+ namespace relations {
+
+ /** Core implementations of the standard relations in #alp::relations. */
+ namespace internal {
+
+ /**
+ * Standard less-than (\a lt) operator.
+ *
+ * Assumes native availability of operator< on the given data types
+ * or assumes that the relevant operators are properly overloaded.
+ *
+ * Assumes that \a lt is a strict total order. Non-standard/non-matching
+ * data types or non-standard (overloaded) \a operator< should
+ * therefore be used with caution.
+ *
+ * @tparam SET The input data type.
+ */
+ template< typename SET, enum Backend implementation = config::default_backend >
+ class lt {
+
+ public:
+ /** Alias to the domain data type. */
+ typedef SET domain;
+
+ /** Alias to the codomain data type. */
+ typedef SET codomain;
+
+ /**
+ * Whether this relation is \em reflexive; that is,
+ * for all \a a in \a SET, \f$ a < a \f$.
+ */
+ static constexpr bool is_reflexive = false;
+
+ /**
+ * Whether this relation is \em irreflexive; that is,
+ * for all \a a in \a SET, not \f$ a < a \f$.
+ */
+ static constexpr bool is_irreflexive = true;
+
+ /**
+ * Whether this relation is \em symmetric; that is,
+ * for all \a a, \a b in \a SET,
+ * if \f$ a < b \f$ then \f$ b < a \f$.
+ */
+ static constexpr bool is_symmetric = false;
+
+ /**
+ * Whether this relation is \em antisymmetric; that is,
+ * for all \a a, \a b in \a SET, if \f$ a < b \f$ and
+ * \f$ b < a \f$ then \f$ a = b \f$.
+ */
+ static constexpr bool is_antisymmetric = true;
+
+ /**
+ * Whether this relation is \em transitive; that is,
+ * for all \a a, \a b, \a c in \a SET, if \f$ a < b \f$ and
+ * \f$ b < c \f$ then \f$ a < c \f$.
+ */
+ static constexpr bool is_transitive = true;
+
+ /**
+ * Whether this relation is \em connected (or total); that is,
+ * for all \a a, \a b in \a SET, if \f$ a \neq b \f$ then
+ * either \f$ a < b \f$ or \f$ b < a \f$.
+ */
+ static constexpr bool is_connected = true;
+
+ /**
+				 * Whether this relation is strongly connected;
+ * that is,
+ * for all \a a, \a b in \a SET,
+ * either \f$ a < b \f$ or \f$ b < a \f$.
+ */
+ static constexpr bool is_strongly_connected = false;
+
+ /**
+				 * This function checks if a < b.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static bool check( const domain * const a,
+ const codomain * const b
+ ) {
+ return *a < *b;
+ }
+ };
+
+ /**
+ * Standard greater-than (\a gt) operator.
+ *
+ * Assumes native availability of \a operator> on the given data types
+ * or assumes that the relevant operators are properly overloaded.
+ *
+ * Assumes that \a gt is a strict total order. Non-standard/non-matching
+ * data types or non-standard (overloaded) \a operator> should
+ * therefore be used with caution.
+ *
+ * @tparam SET The input data type.
+ */
+ template< typename SET, enum Backend implementation = config::default_backend >
+ class gt {
+
+ public:
+ /** Alias to the domain data type. */
+ typedef SET domain;
+
+ /** Alias to the codomain data type. */
+ typedef SET codomain;
+
+ /**
+ * Whether this relation is \em reflexive; that is,
+ * for all \a a in \a SET, \f$ a > a \f$.
+ */
+ static constexpr bool is_reflexive = false;
+
+ /**
+ * Whether this relation is \em irreflexive; that is,
+ * for all \a a in \a SET, not \f$ a > a \f$.
+ */
+ static constexpr bool is_irreflexive = true;
+
+ /**
+ * Whether this relation is \em symmetric; that is,
+ * for all \a a, \a b in \a SET,
+ * if \f$ a > b \f$ then \f$ b > a \f$.
+ */
+ static constexpr bool is_symmetric = false;
+
+ /**
+ * Whether this relation is \em antisymmetric; that is,
+ * for all \a a, \a b in \a SET, if \f$ a > b \f$ and
+ * \f$ b > a \f$ then \f$ a = b \f$.
+ */
+ static constexpr bool is_antisymmetric = true;
+
+ /**
+ * Whether this relation is \em transitive; that is,
+ * for all \a a, \a b, \a c in \a SET, if \f$ a > b \f$ and
+ * \f$ b > c \f$ then \f$ a > c \f$.
+ */
+ static constexpr bool is_transitive = true;
+
+ /**
+ * Whether this relation is \em connected (or total); that is,
+ * for all \a a, \a b in \a SET, if \f$ a \neq b \f$ then
+ * either \f$ a > b \f$ or \f$ b > a \f$.
+ */
+ static constexpr bool is_connected = true;
+
+ /**
+				 * Whether this relation is strongly connected;
+ * that is,
+ * for all \a a, \a b in \a SET,
+ * either \f$ a > b \f$ or \f$ b > a \f$.
+ */
+ static constexpr bool is_strongly_connected = false;
+
+ /**
+				 * This function checks if a > b.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static bool check( const domain * const a,
+ const codomain * const b
+ ) {
+ return *a > *b;
+ }
+ };
+
+ /**
+ * Standard equal (\a eq) relation.
+ *
+ * Assumes native availability of ALP internal operator \a less_than
+ * forming an equivalence relation on SET. Non-standard/non-matching
+ * data types should therefore be used with caution.
+ *
+ * @tparam SET The input data type.
+ */
+ template< typename SET, enum Backend implementation = config::default_backend >
+ class eq {
+
+ public:
+ /** Alias to the domain data type. */
+ typedef SET domain;
+
+ /** Alias to the codomain data type. */
+ typedef SET codomain;
+
+ /**
+ * Whether this relation is \em reflexive; that is,
+ * for all \a a in \a SET, \f$ a = a \f$.
+ */
+ static constexpr bool is_reflexive = true;
+
+ /**
+ * Whether this relation is \em irreflexive; that is,
+ * for all \a a in \a SET, not \f$ a = a \f$.
+ */
+ static constexpr bool is_irreflexive = false;
+
+ /**
+ * Whether this relation is \em symmetric; that is,
+ * for all \a a, \a b in \a SET,
+ * if \f$ a = b \f$ then \f$ b = a \f$.
+ */
+ static constexpr bool is_symmetric = true;
+
+ /**
+ * Whether this relation is \em antisymmetric; that is,
+ * for all \a a, \a b in \a SET, if \f$ a = b \f$ and
+ * \f$ b = a \f$ then \f$ a = b \f$.
+ */
+ static constexpr bool is_antisymmetric = true;
+
+ /**
+ * Whether this relation is \em transitive; that is,
+ * for all \a a, \a b, \a c in \a SET, if \f$ a = b \f$ and
+ * \f$ b = c \f$ then \f$ a = c \f$.
+ */
+ static constexpr bool is_transitive = true;
+
+ /**
+ * Whether this relation is \em connected; that is,
+ * for all \a a, \a b in \a SET, if \f$ a \neq b \f$ then
+ * either \f$ a = b \f$ or \f$ b = a \f$.
+ */
+ static constexpr bool is_connected = false;
+
+ /**
+ * Whether this relation is strongly connected (or total);
+ * that is,
+ * for all \a a, \a b in \a SET,
+ * either \f$ a = b \f$ or \f$ b = a \f$.
+ */
+ static constexpr bool is_strongly_connected = false;
+
+ /**
+				 * This function checks if a == b.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static bool check( const domain * const a,
+ const codomain * const b
+ ) {
+ bool check;
+ operators::internal::template equal<
+ SET, SET, bool, implementation
+ >::apply( a, b, &check );
+ return check;
+ }
+ };
+
+ /**
+ * Standard not-equal (\a neq) operator.
+ *
+ * Assumes availability of ALP internal operator \a not_equal.
+ *
+ * While \a not_equal does not require to form an order or an
+ * equivalence relation on SET, the formed relation is still assumed
+ * to be irreflexive, symmetric, and connected. Non-standard/non-matching
+ * data types should therefore be used with caution.
+ *
+ * @tparam SET The input data type.
+ */
+ template< typename SET, enum Backend implementation = config::default_backend >
+ class neq {
+
+ public:
+ /** Alias to the domain data type. */
+ typedef SET domain;
+
+ /** Alias to the codomain data type. */
+ typedef SET codomain;
+
+ /**
+ * Whether this relation is \em reflexive; that is,
+ * for all \a a in \a SET, \f$ a \neq a \f$.
+ */
+ static constexpr bool is_reflexive = false;
+
+ /**
+ * Whether this relation is \em irreflexive; that is,
+ * for all \a a in \a SET, not \f$ a \neq a \f$.
+ */
+ static constexpr bool is_irreflexive = true;
+
+ /**
+ * Whether this relation is \em symmetric; that is,
+ * for all \a a, \a b in \a SET,
+ * if \f$ a \neq b \f$ then \f$ b \neq a \f$.
+ */
+ static constexpr bool is_symmetric = true;
+
+ /**
+ * Whether this relation is \em antisymmetric; that is,
+ * for all \a a, \a b in \a SET, if \f$ a \neq b \f$ and
+ * \f$ b \neq a \f$ then \f$ a = b \f$.
+ */
+ static constexpr bool is_antisymmetric = false;
+
+ /**
+ * Whether this relation is \em transitive; that is,
+ * for all \a a, \a b, \a c in \a SET, if \f$ a \neq b \f$ and
+ * \f$ b \neq c \f$ then \f$ a \neq c \f$.
+ */
+ static constexpr bool is_transitive = false;
+
+ /**
+ * Whether this relation is \em connected; that is,
+ * for all \a a, \a b in \a SET, if \f$ a \neq b \f$ then
+ * either \f$ a \neq b \f$ or \f$ b \neq a \f$.
+ */
+ static constexpr bool is_connected = true;
+
+ /**
+ * Whether this relation is strongly connected (or total);
+ * that is,
+ * for all \a a, \a b in \a SET,
+ * either \f$ a \neq b \f$ or \f$ b \neq a \f$.
+ */
+ static constexpr bool is_strongly_connected = false;
+
+ /**
+				 * This function checks if a != b.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static bool check( const domain * const a,
+ const codomain * const b
+ ) {
+ bool check;
+ operators::internal::template not_equal<
+ SET, SET, bool, implementation
+ >::apply( a, b, &check );
+ return check;
+ }
+ };
+
+ /**
+ * Standard less-than-or-equal (\a le) operator.
+ *
+ * Assumes native availability of \a operator<= on the given data types
+ * or assumes that the relevant operators are properly overloaded.
+ *
+ * Assumes that \a le is a total order. Non-standard/non-matching
+ * data types or non-standard (overloaded) \a operator<= should
+ * therefore be used with caution.
+ *
+ * @tparam SET The input data type.
+ */
+ template< typename SET, enum Backend implementation = config::default_backend >
+ class le {
+
+ public:
+ /** Alias to the domain data type. */
+ typedef SET domain;
+
+ /** Alias to the codomain data type. */
+ typedef SET codomain;
+
+ /**
+ * Whether this relation is \em reflexive; that is,
+ * for all \a a in \a SET, \f$ a \le a \f$.
+ */
+ static constexpr bool is_reflexive = true;
+
+ /**
+ * Whether this relation is \em irreflexive; that is,
+ * for all \a a in \a SET, not \f$ a \le a \f$.
+ */
+ static constexpr bool is_irreflexive = false;
+
+ /**
+ * Whether this relation is \em symmetric; that is,
+ * for all \a a, \a b in \a SET,
+ * if \f$ a \le b \f$ then \f$ b \le a \f$.
+ */
+ static constexpr bool is_symmetric = false;
+
+ /**
+ * Whether this relation is \em antisymmetric; that is,
+ * for all \a a, \a b in \a SET, if \f$ a \le b \f$ and
+ * \f$ b \le a \f$ then \f$ a = b \f$.
+ */
+ static constexpr bool is_antisymmetric = true;
+
+ /**
+ * Whether this relation is \em transitive; that is,
+ * for all \a a, \a b, \a c in \a SET, if \f$ a \le b \f$ and
+ * \f$ b \le c \f$ then \f$ a \le c \f$.
+ */
+ static constexpr bool is_transitive = true;
+
+ /**
+ * Whether this relation is \em connected; that is,
+ * for all \a a, \a b in \a SET, if \f$ a \neq b \f$ then
+ * either \f$ a \le b \f$ or \f$ b \le a \f$.
+ */
+ static constexpr bool is_connected = true;
+
+ /**
+ * Whether this relation is strongly connected (or total);
+ * that is,
+ * for all \a a, \a b in \a SET,
+ * either \f$ a \le b \f$ or \f$ b \le a \f$.
+ */
+ static constexpr bool is_strongly_connected = true;
+
+ /**
+				 * This function checks if a <= b.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static bool check( const domain * const a,
+ const codomain * const b
+ ) {
+ return *a <= *b;
+ }
+ };
+
+ /**
+ * Standard greater-than-or-equal (\a ge) operator.
+ *
+ * Assumes native availability of \a operator>= on the given data types
+ * or assumes that the relevant operators are properly overloaded.
+ *
+ * Assumes that \a ge is a total order. Non-standard/non-matching
+ * data types or non-standard (overloaded) \a operator>= should
+ * therefore be used with caution.
+ *
+ * @tparam SET The input data type.
+ */
+ template< typename SET, enum Backend implementation = config::default_backend >
+ class ge {
+
+ public:
+ /** Alias to the domain data type. */
+ typedef SET domain;
+
+ /** Alias to the codomain data type. */
+ typedef SET codomain;
+
+ /**
+ * Whether this relation is \em reflexive; that is,
+ * for all \a a in \a SET, \f$ a \ge a \f$.
+ */
+ static constexpr bool is_reflexive = true;
+
+ /**
+ * Whether this relation is \em irreflexive; that is,
+ * for all \a a in \a SET, not \f$ a \ge a \f$.
+ */
+ static constexpr bool is_irreflexive = false;
+
+ /**
+ * Whether this relation is \em symmetric; that is,
+ * for all \a a, \a b in \a SET,
+ * if \f$ a \ge b \f$ then \f$ b \ge a \f$.
+ */
+ static constexpr bool is_symmetric = false;
+
+ /**
+ * Whether this relation is \em antisymmetric; that is,
+ * for all \a a, \a b in \a SET, if \f$ a \ge b \f$ and
+ * \f$ b \ge a \f$ then \f$ a = b \f$.
+ */
+ static constexpr bool is_antisymmetric = true;
+
+ /**
+ * Whether this relation is \em transitive; that is,
+ * for all \a a, \a b, \a c in \a SET, if \f$ a \ge b \f$ and
+ * \f$ b \ge c \f$ then \f$ a \ge c \f$.
+ */
+ static constexpr bool is_transitive = true;
+
+ /**
+ * Whether this relation is \em connected; that is,
+ * for all \a a, \a b in \a SET, if \f$ a \neq b \f$ then
+ * either \f$ a \ge b \f$ or \f$ b \ge a \f$.
+ */
+ static constexpr bool is_connected = true;
+
+ /**
+ * Whether this relation is strongly connected (or total);
+ * that is,
+ * for all \a a, \a b in \a SET,
+ * either \f$ a \ge b \f$ or \f$ b \ge a \f$.
+ */
+ static constexpr bool is_strongly_connected = true;
+
+ /**
+				 * This function checks if a >= b.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static bool check( const domain * const a,
+ const codomain * const b
+ ) {
+ return *a >= *b;
+ }
+ };
+
+ /**
+ * This class takes a generic operator implementation and exposes a more
+ * convenient test() function based on it. This function allows arbitrary
+ * data types being passed as parameters, and automatically handles any
+ * casting required for the raw operator.
+ *
+ * @tparam REL The generic operator implementation.
+ *
+ */
+ template< typename REL, enum Backend implementation = config::default_backend >
+ class RelationBase {
+
+ public:
+
+ /** The domain type. */
+ typedef typename REL::domain D1;
+
+ /** The codomain type. */
+ typedef typename REL::codomain D2;
+
+ /** @return Whether this relation is reflexive. */
+ static constexpr bool is_reflexive() {
+ return REL::is_reflexive;
+ }
+
+ /** @return Whether this relation is irreflexive. */
+ static constexpr bool is_irreflexive() {
+ return REL::is_irreflexive;
+ }
+
+ /** @return Whether this relation is symmetric. */
+ static constexpr bool is_symmetric() {
+ return REL::is_symmetric;
+ }
+
+ /** @return Whether this relation is antisymmetric. */
+ static constexpr bool is_antisymmetric() {
+ return REL::is_antisymmetric;
+ }
+
+ /** @return Whether this relation is transitive. */
+ static constexpr bool is_transitive() {
+ return REL::is_transitive;
+ }
+
+ /** @return Whether this relation is connected. */
+ static constexpr bool is_connected() {
+ return REL::is_connected;
+ }
+
+ /** @return Whether this relation is strongly connected. */
+ static constexpr bool is_strongly_connected() {
+ return REL::is_strongly_connected;
+ }
+
+ /**
+ * This function checks if \f$ x REL y \f$.
+ *
+ * @tparam InputType1 The type of the input parameter \a x.
+ * @tparam InputType2 The type of the input parameter \a y.
+ *
+ * \warning If \a InputType1 does not match \a D1 \em or \a InputType2 does
+ * not match \a D2, then input will be cast into temporary
+ * variables of the correct types.
+ *
+ * \note Best performance is thus only guaranteed when all domains match.
+ *
+ * @param[in] x The left-hand side input.
+ * @param[in] y The right-hand side input.
+ */
+ template< typename InputType1, typename InputType2 >
+ static bool check( const InputType1 &x, const InputType2 &y ) {
+ const D1 a = static_cast< D1 >( x );
+ const D2 b = static_cast< D2 >( y );
+ return REL::check( &a, &b );
+ }
+
+ /**
+ * This is the high-performance version of check() in the sense that no
+ * casting is required. This version will be automatically called whenever
+ * possible.
+ */
+ static bool check( const D1 &x, const D2 &y ) {
+ return REL::check( &x, &y );
+ }
+ };
+
+ /**
+ * This is the relation interface exposed to the ALP implementation.
+ *
+ * This class wraps around a base relation of type \a REL we denote by
+ * \f$ REL \subseteq D_1\times D_2 \f$.
+ *
+ * \parblock
+ * \par Base Operators
+ *
+ * The class \a REL is expected to define the following public function:
+ * - \a check, which takes two pointers to parameters \f$ a \in D_1 \f$
+ * and \f$ b \in D_2 \f$ and checks if
+ * \f$ a REL b \f$.
+ *
+ * It is also expected to define the following types:
+ * - \a domain, which corresponds to \f$ D_1 \f$,
+ * - \a codomain, which corresponds to \f$ D_2 \f$.
+ *
+ * It is also expected to define the following public boolean fields:
+ * - \a is_reflexive
+ * - \a is_irreflexive
+ * - \a is_symmetric
+ * - \a is_antisymmetric
+ * - \a is_transitive
+ * - \a is_connected
+ * - \a is_strongly_connected
+ *
+ * For an example of base relation, see alp::relations::internal::lt.
+ *
+ * \note ALP users should never access these classes directly. This
+ * documentation is provided for developers to understand or extend
+ * the current implementation, for example to include new relations.
+ *
+ * \endparblock
+ *
+ * \parblock
+ * \par The exposed GraphBLAS Relation Interface
+ *
+ * The Base Relations as illustrated above are wrapped by this class to
+	 * provide a more convenient API. It translates the functionality of any Base
+ * Relation and exposes the following interface instead:
+ *
+ * -# check, which takes two parameters \f$ a, b \f$ of arbitrary
+ * types and checks \f$ a REL b \f$ after performing any
+ * casting if required.
+ * \endparblock
+ *
+ * \note This class only allows wrapping of stateless base relations. This
+ * ALP implementation in principle allows for stateful
+ * relations, though they must be provided by a specialised class
+ * which directly implements the above public interface.
+ *
+ * @see RelationBase::check
+ *
+ * \parblock
+ * \par Providing New Relations
+ *
+ * New relations are easily added to this
+ * ALP implementation by providing a base relation and wrapping this
+ * class around it, as illustrated, e.g., by alp::relations::lt as follows:
+ *
+ * \snippet rels.hpp Relation Wrapping
+ *
+	 * This needs to be compatible with the ALP type traits, specifically,
+	 * the #is_relation template. To ensure this, a specialisation of it must be
+	 * provided:
+ *
+ * \snippet rels.hpp Relation Type Traits
+ * \endparblock
+ */
+ template< typename REL, enum Backend implementation = config::default_backend >
+ class Relation : public RelationBase< REL, implementation > {
+
+ // public:
+ // typedef typename RelationBase< REL, implementation >::D1 D1;
+ // typedef typename RelationBase< REL, implementation >::D2 D2;
+
+ };
+
+ /**
+ *
+ * @tparam REL The generic homogeneous relation.
+ *
+ * @see Relation
+ * @see RelationBase for additional functions exposed to the final relation.
+ */
+ template<
+ typename REL,
+ enum Backend implementation = config::default_backend,
+ std::enable_if_t<
+ std::is_same<
+ typename REL::domain,
+ typename REL::codomain
+ >::value
+ > * = nullptr
+ >
+ class HomogeneousRelation : public Relation< REL, implementation > {
+ };
+
+ } // namespace internal
+
+ } // namespace relations
+
+ template< typename Rel >
+ struct is_homogeneous_relation {
+ static const constexpr bool value = is_relation< Rel >::value
+ and std::is_same< typename Rel::D1, typename Rel::D2 >::value;
+ };
+
+ template< typename Rel >
+ struct is_reflexive {
+ static const constexpr bool value = is_homogeneous_relation< Rel >::value
+ and Rel::is_reflexive();
+ };
+
+ template< typename Rel >
+ struct is_irreflexive {
+ static const constexpr bool value = is_homogeneous_relation< Rel >::value
+ and Rel::is_irreflexive();
+ };
+
+ template< typename Rel >
+ struct is_symmetric {
+ static const constexpr bool value = is_homogeneous_relation< Rel >::value
+ and Rel::is_symmetric();
+ };
+
+ template< typename Rel >
+ struct is_antisymmetric {
+ static const constexpr bool value = is_homogeneous_relation< Rel >::value
+ and Rel::is_antisymmetric();
+ };
+
+ template< typename Rel >
+ struct is_transitive {
+ static const constexpr bool value = is_homogeneous_relation< Rel >::value
+ and Rel::is_transitive();
+ };
+
+ template< typename Rel >
+ struct is_connected {
+ static const constexpr bool value = is_homogeneous_relation< Rel >::value
+ and Rel::is_connected();
+ };
+
+ template< typename Rel >
+ struct is_strongly_connected {
+ static const constexpr bool value = is_homogeneous_relation< Rel >::value
+ and Rel::is_strongly_connected();
+ };
+
+ template< typename Rel >
+ struct is_asymmetric {
+ static const constexpr bool value = is_irreflexive< Rel >::value
+ and is_antisymmetric< Rel >::value;
+ };
+
+ template< typename Rel >
+ struct is_partial_order {
+ static const constexpr bool value = is_reflexive< Rel >::value
+ and is_antisymmetric< Rel >::value
+ and is_transitive< Rel >::value;
+ };
+
+ template< typename Rel >
+ struct is_strict_partial_order {
+ static const constexpr bool value = is_asymmetric< Rel >::value
+ and is_transitive< Rel >::value;
+ };
+
+ template< typename Rel >
+ struct is_total_order {
+ static const constexpr bool value = is_partial_order< Rel >::value
+ and is_strongly_connected< Rel >::value;
+ };
+
+ template< typename Rel >
+ struct is_strict_total_order {
+ static const constexpr bool value = is_strict_partial_order< Rel >::value
+ and is_connected< Rel >::value;
+ };
+
+ template< typename Rel >
+ struct is_equivalence_relation {
+ static const constexpr bool value = is_reflexive< Rel >::value
+ and is_symmetric< Rel >::value
+ and is_transitive< Rel >::value;
+ };
+
+} // namespace alp
+
+#endif // _H_ALP_INTERNAL_RELATIONS_BASE
+
diff --git a/include/alp/base/io.hpp b/include/alp/base/io.hpp
new file mode 100644
index 000000000..926b4b7b6
--- /dev/null
+++ b/include/alp/base/io.hpp
@@ -0,0 +1,624 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 21st of February, 2017
+ */
+
+#ifndef _H_ALP_IO_BASE
+#define _H_ALP_IO_BASE
+
+#include
+#include
+#include
+
+#include "matrix.hpp"
+#include "scalar.hpp"
+#include "vector.hpp"
+
+
+namespace alp {
+
+ /**
+ * \defgroup IO Data Ingestion and Extraction.
+ * Provides functions for putting user data into opaque GraphBLAS objects,
+ * and provides functions for extracting data from opaque GraphBLAS objects.
+ *
+ * The GraphBLAS operates on opaque data objects. Users can input data using
+ * alp::buildVector and/or alp::buildMatrixUnique. This group provides free
+ * functions that automatically dispatch to those variants.
+ *
+ * The standard output methods are provided by alp::Vector::cbegin and
+ * alp::Vector::cend, and similarly for alp::Matrix. Iterators provide
+ * parallel output (see #IOMode for a discussion on parallel versus
+ * sequential IO).
+ *
+ * Sometimes it is desired to have direct access to a GraphBLAS memory
+ * area, and to have that memory available even after the GraphBLAS
+ * context has been closed (via alp::finalize). This functionality is
+ * provided by alp::pin_memory.
+ *
+ * @{
+ */
+
+ template< typename D, typename Structure, typename View, typename ImfR, typename ImfC, enum Backend backend >
+ size_t nrows( const Matrix< D, Structure, Density::Dense, View, ImfR, ImfC, backend > & A ) noexcept;
+
+ template< typename D, typename Structure, typename View, typename ImfR, typename ImfC, enum Backend backend >
+ size_t ncols( const Matrix< D, Structure, Density::Dense, View, ImfR, ImfC, backend > & A ) noexcept;
+
+ template< typename D, typename Structure, typename View, typename ImfR, typename ImfC, enum Backend backend >
+ std::pair< size_t, size_t > dims( const Matrix< D, Structure, Density::Dense, View, ImfR, ImfC, backend > & A ) noexcept;
+
+ /**
+ * Request the size (dimension) of a given Vector.
+ */
+ template<
+ typename DataType, typename DataStructure, typename View,
+ typename ImfR, typename ImfC, Backend backend
+ >
+ size_t size(
+ const Vector< DataType, DataStructure, Density::Dense, View, ImfR, ImfC, backend > &x
+ ) noexcept {
+
+#ifndef NDEBUG
+ const bool selected_backend_does_not_support_size_for_vector = false;
+ assert( selected_backend_does_not_support_size_for_vector );
+#endif
+ (void) x;
+ return SIZE_MAX;
+ }
+
+	/**
+	 * Request the number of nonzeroes in a given Vector.
+	 *
+	 * This base variant is a fall-back: it asserts in debug builds and
+	 * returns SIZE_MAX, signalling the selected backend provides no
+	 * specialisation.
+	 */
+	template<
+		typename DataType, typename DataStructure, typename View,
+		typename ImfR, typename ImfC, Backend backend
+	>
+	size_t nnz(
+		const Vector< DataType, DataStructure, Density::Dense, View, ImfR, ImfC, backend > &x
+	) noexcept {
+
+#ifndef NDEBUG
+		const bool selected_backend_does_not_support_nnz_for_vector = false;
+		assert( selected_backend_does_not_support_nnz_for_vector );
+#endif
+		(void) x;
+		return SIZE_MAX;
+	}
+
+	/**
+	 * Retrieve the number of nonzeroes contained in this matrix.
+	 *
+	 * @returns The number of nonzeroes the current matrix contains.
+	 *
+	 * \parblock
+	 * \par Performance semantics.
+	 *        -# This function constitutes \f$ \Theta(1) \f$ work.
+	 *        -# This function allocates no additional dynamic memory.
+	 *        -# This function uses \f$ \mathcal{O}(1) \f$ memory
+	 *           beyond that which was already used at function entry.
+	 *        -# This function will move
+	 *           \f$ \mathit{sizeof}( size\_t ) \f$
+	 *           bytes of memory.
+	 * \endparblock
+	 */
+	template<
+		typename DataType, typename Structure, typename View,
+		typename ImfR, typename ImfC, Backend backend
+	>
+	size_t nnz(
+		const Matrix< DataType, Structure, Density::Dense, View, ImfR, ImfC, backend > &A
+	) noexcept {
+
+#ifndef NDEBUG
+		const bool selected_backend_does_not_support_nnz_for_matrix = false;
+		assert( selected_backend_does_not_support_nnz_for_matrix );
+#endif
+		(void) A;
+		return SIZE_MAX;
+	}
+
+ /**
+ * Clears all elements from the given vector \a x.
+ *
+ * At the end of this operation, the number of nonzero elements in this vector
+ * will be zero. The size of the vector remains unchanged.
+ */
+ template<
+ typename DataType, typename DataStructure, typename View,
+ typename ImfR, typename ImfC, Backend backend
+ >
+ RC clear(
+ Vector< DataType, DataStructure, Density::Dense, View, ImfR, ImfC, backend > &x
+ ) noexcept {
+ (void) x;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Resizes the Scalar to have at least the given number of nonzeroes.
+ * The contents of the scalar are not retained.
+ */
+ template<
+ typename InputType, typename InputStructure,
+ typename length_type, Backend backend
+ >
+ RC resize( Scalar< InputType, InputStructure, backend > &s, const length_type new_nz ) {
+ (void) s;
+ (void) new_nz;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Resizes the vector to have at least the given number of nonzeroes.
+ * The contents of the vector are not retained.
+ */
+ template<
+ typename InputType, typename InputStructure, typename View,
+ typename ImfR, typename ImfC,
+ typename length_type, Backend backend
+ >
+ RC resize(
+ Vector< InputType, InputStructure, Density::Dense, View, ImfR, ImfC, backend > &x,
+ const length_type new_nz
+ ) noexcept {
+ (void) x;
+ (void) new_nz;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Resizes the matrix to have at least the given number of nonzeroes.
+ * The contents of the matrix are not retained.
+ */
+ template<
+ typename InputType, typename InputStructure, typename InputView,
+ typename InputImfR, typename InputImfC, Backend backend
+ >
+ RC resize(
+ Matrix< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &A,
+ const size_t new_nz
+ ) noexcept {
+ (void) A;
+ (void) new_nz;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Sets all elements of a Vector to the given value.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename DataStructure, typename View,
+ typename ImfR, typename ImfC,
+ typename T, typename ValStructure,
+ Backend backend
+ >
+ RC set(
+ Vector< DataType, DataStructure, Density::Dense, View, ImfR, ImfC, backend > &x,
+ const Scalar< T, ValStructure, backend > val,
+ const std::enable_if_t<
+ !alp::is_object< DataType >::value &&
+ !alp::is_object< T >::value
+ > * const = nullptr
+ ) {
+ (void) x;
+ (void) val;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Sets the element of a given Vector at a given position to a given value.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename DataStructure, typename View,
+ typename ImfR, typename ImfC,
+ typename T, typename ValStructure,
+ Backend backend
+ >
+ RC setElement(
+ Vector< DataType, DataStructure, Density::Dense, View, ImfR, ImfC, backend > &x,
+ const Scalar< T, ValStructure, backend > val,
+ const size_t i,
+ const std::enable_if_t<
+ !alp::is_object< DataType >::value &&
+ !alp::is_object< T >::value
+ > * const = nullptr
+ ) {
+ (void) x;
+ (void) val;
+ (void) i;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Sets all elements of the output matrix to the values of the input matrix.
+ * C = A
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView,
+ typename OutputImfR, typename OutputImfC,
+ typename InputType, typename InputStructure, typename InputView,
+ typename InputImfR, typename InputImfC,
+ Backend backend
+ >
+ RC set(
+ Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &C,
+ const Matrix< InputType, InputStructure, Density::Dense, InputView, InputImfR, InputImfC, backend > &A
+ ) noexcept {
+ (void) C;
+ (void) A;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Sets all elements of the given matrix to the value of the given scalar.
+ * C = val
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename OutputStructure, typename OutputView,
+ typename OutputImfR, typename OutputImfC,
+ typename InputType, typename InputStructure,
+ Backend backend
+ >
+ RC set(
+ Matrix< OutputType, OutputStructure, Density::Dense, OutputView, OutputImfR, OutputImfC, backend > &C,
+ const Scalar< InputType, InputStructure, backend > &val
+ ) noexcept {
+ (void) C;
+ (void) val;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Constructs a dense vector from a container of exactly alp::size(x)
+ * elements. This function aliases to the buildVector routine that takes
+ * an accumulator, using alp::operators::right_assign (thus overwriting
+ * any old contents).
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename fwd_iterator,
+ Backend backend, typename Coords
+ >
+ RC buildVector(
+ internal::Vector< InputType, backend > &x,
+ fwd_iterator start, const fwd_iterator end,
+ const IOMode mode
+ ) {
+ operators::right_assign< InputType > accum;
+ return buildVector< descr >( x, accum, start, end, mode );
+ }
+
+ /**
+ * Ingests possibly sparse input from a container to which iterators are
+ * provided. This function dispatches to the buildVector routine that
+ * includes an accumulator, here set to alp::operators::right_assign.
+ * Any existing values in \a x that overlap with newer values will hence
+ * be overwritten.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType,
+ class Merger = operators::right_assign< InputType >,
+ typename fwd_iterator1, typename fwd_iterator2,
+ Backend backend, typename Coords
+ >
+ RC buildVector(
+ internal::Vector< InputType, backend > &x,
+ fwd_iterator1 ind_start, const fwd_iterator1 ind_end,
+ fwd_iterator2 val_start, const fwd_iterator2 val_end,
+ const IOMode mode, const Merger &merger = Merger()
+ ) {
+ operators::right_assign< InputType > accum;
+ return buildVector< descr >( x, accum, ind_start, ind_end, val_start, val_end, mode, merger );
+ }
+
+ /**
+ * Ingests a set of nonzeroes into a given vector \a x.
+ *
+ * Old values will be overwritten. The given set of nonzeroes must not contain
+ * duplicate nonzeroes that should be stored at the same index.
+ *
+ * \warning Inputs with duplicate nonzeroes when passed into this function will
+ * invoke undefined behaviour.
+ *
+ * @param[in,out] x The vector where to ingest nonzeroes into.
+ * @param[in] ind_start Start iterator to the nonzero indices.
+ * @param[in] ind_end End iterator to the nonzero indices.
+ * @param[in] val_start Start iterator to the nonzero values.
+ * @param[in] val_end End iterator to the nonzero values.
+ * @param[in] mode Whether sequential or parallel ingestion is requested.
+ *
+ * The containers the two iterator pairs point to must contain an equal number
+ * of elements. Any pre-existing nonzeroes that do not overlap with any nonzero
+ * between \a ind_start and \a ind_end will remain unchanged.
+ *
+ * \parblock
+ * \par Performance semantics:
+ * A call to this function
+ * -# comprises \f$ \mathcal{O}( n ) \f$ work where \a n is the number of
+ * elements pointed to by the given iterator pairs. This work may be
+ * distributed over multiple user processes.
+ * -# results in at most \f$ n \mathit{sizeof}( T ) +
+ * n \mathit{sizeof}( U ) +
+ * n \mathit{sizeof}( \mathit{InputType} ) +
+ * 2 n \mathit{sizeof}( \mathit{bool} ) \f$
+ * bytes of data movement, where \a T and \a U are the underlying data
+ * types of the input iterators. These costs may be distributed over
+ * multiple user processes.
+ * -# inter-process communication costs are \f$ \mathcal{O}(n) g + l \f$.
+ * -# if the capacity of this vector is not large enough to hold \a n
+ * elements, a call to this function may allocate
+ * \f$ \mathcal{O}( n ) \f$
+ * new bytes of memory which \em may be distributed over multiple user
+ * processes.
+ * -# if the capacity of this vector is not large enough to hold \a n
+ * elements, a call to this function may result in system calls at any of
+ * the user processes.
+ * -# If the IOMode is sequential, then the work and data movement costs are
+ * incurred per user process and will not be distributed. In this
+ * case the inter-process communication costs will, however, be zero.
+ * -# if the IOMode is parallel, then a good implementation under a uniformly
+ * randomly distributed input incurs an inter-process communication cost
+ * of expected value \f$ n/p g + l \f$. The best-case inter-process cost
+ * is \f$ (p-1)g + l \f$.
+ * \endparblock
+ *
+ * @returns alp::SUCCESS When ingestion has completed successfully.
+ * @returns alp::ILLEGAL When a nonzero has an index larger than alp::size(x).
+	 * @returns alp::PANIC    If an unmitigable error has occurred during ingestion.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType,
+ class Merger = operators::right_assign< InputType >,
+ typename fwd_iterator1, typename fwd_iterator2,
+ Backend backend, typename Coords
+ >
+ RC buildVectorUnique(
+ internal::Vector< InputType, backend > &x,
+ fwd_iterator1 ind_start, const fwd_iterator1 ind_end,
+ fwd_iterator2 val_start, const fwd_iterator2 val_end,
+ const IOMode mode
+ ) {
+ return buildVector< descr | descriptors::no_duplicates >( x,
+ ind_start, ind_end,
+ val_start, val_end,
+ mode );
+ }
+
+ /**
+ * Assigns nonzeroes to the matrix from a coordinate format.
+ *
+ * Invalidates any prior existing content. Disallows different nonzeroes
+ * to have the same row and column coordinates; input must consist out of
+ * unique triples. See #buildMatrix for an alternate function that does
+ * not have these restrictions-- at the cost of lower performance.
+ *
+ * \warning Calling this function with duplicate input coordinates will
+ * lead to undefined behaviour.
+ *
+ * @tparam descr The descriptor used. The default is
+ * #alp::descriptors::no_operation, which means that
+ * no pre- or post-processing of input or input is
+ * performed.
+ * @tparam fwd_iterator1 The type of the row index iterator.
+ * @tparam fwd_iterator2 The type of the column index iterator.
+ * @tparam fwd_iterator3 The type of the nonzero value iterator.
+ * @tparam length_type The type of the number of elements in each iterator.
+ *
+ * The iterators will only be used to read from, never to assign to.
+ *
+ * @param[in] I A forward iterator to \a cap row indices.
+ * @param[in] J A forward iterator to \a cap column indices.
+ * @param[in] V A forward iterator to \a cap nonzero values.
+ * @param[in] nz The number of items pointed to by \a I, \a J, \em and \a V.
+ *
+ * @return alp::MISMATCH -# when an element from \a I dereferences to a value
+ * larger than the row dimension of this matrix, or
+ * -# when an element from \a J dereferences to a value
+ * larger than the column dimension of this matrix.
+ * When this error code is returned the state of this
+ * container will be as though this function was never
+ * called; however, the given forward iterators may
+ * have been copied and the copied iterators may have
+ * incurred multiple increments and dereferences.
+ * @return alp::OVERFLW When the internal data type used for storing the
+ * number of nonzeroes is not large enough to store
+ * the number of nonzeroes the user wants to assign.
+ * When this error code is returned the state of this
+ * container will be as though this function was never
+ * called; however, the given forward iterators may
+ * have been copied and the copied iterators may have
+ * incurred multiple increments and dereferences.
+ * @return alp::SUCCESS When the function completes successfully.
+ *
+ * \parblock
+ * \par Performance semantics.
+	 *        -# This function contains
+	 *           \f$ \Theta(\mathit{nz})+\mathcal{O}(m+n) \f$ amount of work.
+	 *        -# This function may dynamically allocate
+	 *           \f$ \Theta(\mathit{nz})+\mathcal{O}(m+n) \f$ bytes of memory.
+ * -# A call to this function will use \f$ \mathcal{O}(m+n) \f$ bytes
+ * of memory beyond the memory in use at the function call entry.
+ * -# This function will copy each input forward iterator at most
+ * \em once; the three input iterators \a I, \a J, and \a V thus
+	 *           may have exactly one copy each, meaning that all input may be
+	 *           traversed only once.
+	 *        -# Each of the at most three iterator copies will be incremented
+ * at most \f$ \mathit{nz} \f$ times.
+ * -# Each position of the each of the at most three iterator copies
+ * will be dereferenced exactly once.
+	 *        -# This function moves
+	 *           \f$ \Theta(\mathit{nz})+\mathcal{O}(m+n) \f$ bytes of data.
+ * -# This function will likely make system calls.
+ * \endparblock
+ *
+ * \warning This is an expensive function. Use sparingly and only when
+ * absolutely necessary.
+ *
+ * \note Streaming input can be implemented by supplying buffered
+ * iterators to this GraphBLAS implementation.
+ *
+ * \note The functionality herein described is exactly that of buildMatrix,
+ * though with stricter input requirements. These requirements allow
+ * much faster construction.
+ *
+ * \note No masked version of this variant is provided. The use of masks in
+ * matrix construction is costly and the user is referred to the
+ * costly buildMatrix() function instead.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType,
+ typename fwd_iterator1 = const size_t * __restrict__,
+ typename fwd_iterator2 = const size_t * __restrict__,
+ typename fwd_iterator3 = const InputType * __restrict__,
+ typename length_type = size_t,
+ Backend implementation = config::default_backend
+ >
+ RC buildMatrixUnique(
+ internal::Matrix< InputType, implementation > &A,
+ fwd_iterator1 I, fwd_iterator1 I_end,
+ fwd_iterator2 J, fwd_iterator2 J_end,
+ fwd_iterator3 V, fwd_iterator3 V_end,
+ const IOMode mode
+ ) {
+ // derive synchronized iterator
+ auto start = utils::makeSynchronized( I, J, V, I_end, J_end, V_end );
+ const auto end = utils::makeSynchronized( I_end, J_end, V_end, I_end, J_end, V_end );
+
+ // defer to other signature
+ return buildMatrixUnique< descr >( A, start, end, mode );
+ }
+
+ /**
+ * Alias that transforms a set of pointers and an array length to the
+ * buildMatrixUnique variant based on iterators.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType,
+ typename fwd_iterator1 = const size_t * __restrict__,
+ typename fwd_iterator2 = const size_t * __restrict__,
+ typename fwd_iterator3 = const InputType * __restrict__,
+ typename length_type = size_t,
+ Backend implementation = config::default_backend
+ >
+ RC buildMatrixUnique( internal::Matrix< InputType, implementation > &A,
+ fwd_iterator1 I, fwd_iterator2 J, fwd_iterator3 V,
+ const size_t nz, const IOMode mode
+ ) {
+ return buildMatrixUnique< descr >( A,
+ I, I + nz,
+ J, J + nz,
+ V, V + nz,
+ mode
+ );
+ }
+
+ /** Version of the above #buildMatrixUnique that handles \a NULL value pointers. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType,
+ typename fwd_iterator1 = const size_t * __restrict__,
+ typename fwd_iterator2 = const size_t * __restrict__,
+ typename length_type = size_t,
+ Backend implementation = config::default_backend
+ >
+ RC buildMatrixUnique(
+ internal::Matrix< InputType, implementation > &A,
+ fwd_iterator1 I, fwd_iterator2 J,
+ const length_type nz, const IOMode mode
+ ) {
+ // derive synchronized iterator
+ auto start = utils::makeSynchronized( I, J, I + nz, J + nz );
+ const auto end = utils::makeSynchronized( I + nz, J + nz, I + nz, J + nz );
+
+ // defer to other signature
+ return buildMatrixUnique< descr >( A, start, end, mode );
+ }
+
+ /**
+ * Version of buildMatrixUnique that works by supplying a single iterator
+ * (instead of three).
+ *
+ * This is useful in cases where the input is given as a single struct per
+ * nonzero, whatever this struct may be exactly, as opposed to multiple
+ * containers for row indices, column indices, and nonzero values.
+ *
+ * This GraphBLAS implementation provides both input modes since which one is
+ * more appropriate (and performant!) depends mostly on how the data happens
+ * to be stored in practice.
+ *
+ * @tparam descr The currently active descriptor.
+ * @tparam InputType The value type the output matrix expects.
+ * @tparam fwd_iterator The iterator type.
+ * @tparam implementation For which backend a matrix is being read.
+ *
+ * The iterator \a fwd_iterator, in addition to being STL-compatible, must
+ * support the following three public functions:
+ * -# S fwd_iterator.i(); which returns the row index of the current
+ * nonzero;
+	 *   -# S fwd_iterator.j(); which returns the column index of the
+ * current nonzero;
+ * -# V fwd_iterator.v(); which returns the nonzero value of the
+ * current nonzero.
+ *
+ * It also must provide the following public typedefs:
+ * -# fwd_iterator::row_coordinate_type
+ * -# fwd_iterator::column_coordinate_type
+ * -# fwd_iterator::nonzero_value_type
+ *
+ * This means a specialised iterator is required for use with this function.
+ * See, for example, alp::utils::internal::MatrixFileIterator.
+ *
+ * @param[out] A The matrix to be filled with nonzeroes from \a start to
+ * \a end.
+ * @param[in] start Iterator pointing to the first nonzero to be added.
+ * @param[in] end Iterator pointing past the last nonzero to be added.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename fwd_iterator,
+ Backend implementation = config::default_backend
+ >
+ RC buildMatrixUnique(
+ internal::Matrix< InputType, implementation > &A,
+ fwd_iterator start, const fwd_iterator end,
+ const IOMode mode
+ ) {
+ (void)A;
+ (void)start;
+ (void)end;
+ (void)mode;
+ return UNSUPPORTED;
+ }
+
+ /** @} */
+
+} // namespace alp
+
+#endif // end _H_ALP_IO_BASE
+
diff --git a/include/alp/base/matrix.hpp b/include/alp/base/matrix.hpp
new file mode 100644
index 000000000..605e23ec5
--- /dev/null
+++ b/include/alp/base/matrix.hpp
@@ -0,0 +1,487 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 10th of August
+ */
+
+#ifndef _H_ALP_MATRIX_BASE
+#define _H_ALP_MATRIX_BASE
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+namespace alp {
+
+ namespace internal {
+ /**
+ * A GraphBLAS matrix.
+ *
+ * This is an opaque data type that implements the below functions.
+ *
+ * @tparam D The type of a nonzero element. \a D shall not be a GraphBLAS
+ * type.
+ * @tparam implementation Allows multiple backends to implement different
+ * versions of this data type.
+ *
+ * \warning Creating an alp::Matrix of other GraphBLAS types is
+ * not allowed.
+ * Passing a GraphBLAS type as template parameter will lead to
+ * undefined behaviour.
+ */
+ template< typename D, enum Backend implementation >
+ class Matrix {
+
+ typedef Matrix< D, implementation > self_type;
+
+ public :
+
+ /**
+ * A standard iterator for a GraphBLAS matrix.
+ *
+ * This iterator is used for data extraction only. Hence only this const
+ * version is specified.
+ *
+ * Dereferencing an iterator of this type that is not in end position yields
+ * a pair \f$ (c,v) \f$. The value \a v is of type \a D and corresponds to
+ * the value of the dereferenced nonzero.
+ * The value \a c is another pair \f$ (i,j) \f$. The values \a i and \a j
+ * are of type size_t and correspond to the coordinate of the
+ * dereferenced nonzero.
+ *
+ * \note `Pair' here corresponds to the regular std::pair.
+ *
+ * \warning Comparing two const iterators corresponding to different
+ * containers leads to undefined behaviour.
+ * \warning Advancing an iterator past the end iterator of the container
+ * it corresponds to, leads to undefined behaviour.
+ * \warning Modifying the contents of a container makes any use of any
+ * iterator derived from it incur invalid behaviour.
+ * \note These are standard limitations of STL iterators.
+ */
+ class const_iterator : public std::iterator< std::forward_iterator_tag, std::pair< std::pair< const size_t, const size_t >, const D >, size_t > {
+
+ public :
+
+ /** Standard equals operator. */
+ bool
+ operator==( const const_iterator & other ) const { (void)other; return false; }
+
+ /** @returns The negation of operator==(). */
+ bool operator!=( const const_iterator & other ) const {
+ (void)other;
+ return true;
+ }
+
+ /**
+ * Dereferences the current position of this iterator.
+ *
+ * @return If this iterator is valid and not in end position, this returns
+ * an std::pair with in its first field the position of the
+ * nonzero value, and in its second field the value of the nonzero.
+ * The position of a nonzero is another std::pair with both the
+ * first and second field of type size_t.
+ *
+ * \note If this iterator is invalid or in end position, the result is
+ * undefined.
+ */
+ std::pair< const size_t, const D > operator*() const {
+ return std::pair< const size_t, const D >();
+ }
+
+ /**
+ * Advances the position of this iterator by one.
+ *
+ * If the current position corresponds to the last element in the
+ * container, the new position of this iterator will be its end
+ * position.
+ *
+ * If the current position of this iterator is already the end
+ * position, this iterator will become invalid; any use of invalid
+ * iterators will lead to undefined behaviour.
+ *
+ * @return A reference to this iterator.
+ */
+ const_iterator & operator++() {
+ return *this;
+ }
+
+ }; // class const_iterator
+
+ /** The value type of elements stored in this matrix. */
+ typedef D value_type;
+
+ /**
+ * The main GraphBLAS matrix constructor.
+ *
+ * Matrix nonzeroes will be uninitialised after successful construction.
+ *
+ * Requesting a matrix with zero \a rows or \a columns will yield an empty
+ * matrix; i.e., it will be useless but will not result in an error.
+ *
+ * @param rows The number of rows in the new matrix.
+ * @param columns The number of columns in the new matrix.
+ *
+ * @return SUCCESS This function never fails.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This constructor completes in \f$ \Theta(1) \f$ time.
+ * -# This constructor will not allocate any new dynamic memory.
+ * -# This constructor will use \f$ \Theta(1) \f$ extra bytes of
+ * memory beyond that at constructor entry.
+ * -# This constructor incurs \f$ \Theta(1) \f$ data movement.
+ * -# This constructor \em may make system calls.
+ * \endparblock
+ *
+ * \warning Avoid the use of this constructor within performance critical
+ * code sections.
+ */
+ Matrix( const size_t rows, const size_t columns ) {
+ (void)rows;
+ (void)columns;
+ }
+
+ /**
+ * Copy constructor.
+ *
+ * @param other The matrix to copy.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * Allocates the same capacity as the \a other matrix, even if the
+ * actual number of nonzeroes contained in \a other is less.
+ * -# This constructor entails \f$ \Theta(\mathit{nz}) \f$ amount of
+ * work.
+ * -# This constructor allocates \f$ \Theta(\mathit{nz}) \f$ bytes
+ * of dynamic memory.
+ * -# This constructor incurs \f$ \Theta(\mathit{nz}) \f$ of data
+ * movement.
+ * -# This constructor \em may make system calls.
+ * \endparblock
+ *
+ * \warning Avoid the use of this constructor within performance critical
+ * code sections.
+ */
+ Matrix( const Matrix< D, implementation > & other ) {
+ (void)other;
+ }
+
+ /**
+ * Move constructor. This will make the new matrix equal to the given
+ * GraphBLAS matrix while destroying the given matrix.
+ *
+ * @param[in] other The GraphBLAS matrix to move to this new instance.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+ * -# This constructor will not allocate any new dynamic memory.
+ * -# This constructor will use \f$ \Theta(1) \f$ extra bytes of
+ * memory beyond that at constructor entry.
+ * -# This constructor will move \f$ \Theta(1) \f$ bytes of data.
+ * \endparblock
+ */
+ Matrix( self_type && other ) {
+ (void)other;
+ }
+
+ /**
+ * Matrix destructor.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This destructor entails \f$ \Theta(1) \f$ amount of work.
+ * -# This destructor will not perform any memory allocations.
+ * -# This destructor will use \f$ \mathcal{O}(1) \f$ extra bytes of
+ * memory beyond that at constructor entry.
+ * -# This destructor will move \f$ \Theta(1) \f$ bytes of data.
+ * -# This destructor makes system calls.
+ * \endparblock
+ *
+ * \warning Avoid calling destructors from within performance critical
+ * code sections.
+ */
+ ~Matrix() {}
+
+ /**
+ * Assigns nonzeroes to the matrix from a coordinate format.
+ *
+ * Any prior content may be combined with new input according to the
+ * user-supplied accumulator operator (\a accum).
+ *
+ * Input triplets need not be unique. Input triplets that are written to the
+ * same row and column coordinates will be combined using the supplied
+ * duplicate operator (\a dup).
+ *
+ * \note Note that \a dup and \a accum may differ. The duplicate operator is
+ * \em not applied to any pre-existing nonzero values.
+ *
+ * \note The order of application of the operators is undefined.
+ *
+ * The number of nonzeroes, after reduction by duplicate removals and after
+ * merger with the existing nonzero structure, must be equal or less than the
+ * space reserved during the construction of this matrix. The nonzeroes will
+ * not be stored in a fully sorted fashion-- it will be sorted column-wise,
+ * but within each column the order can be arbitrary.
+ *
+ * @tparam accum How existing entries of this matrix should be
+ * treated.
+ * The default is #alp::operators::right_assign, which
+ * means that any existing values are overwritten with
+ * the new values.
+ * @tparam dup How to handle duplicate entries. The default is
+ * #alp::operators::add, which means that duplicated
+ * values are combined by addition.
+ * @tparam descr The descriptor used. The default is
+ * #alp::descriptors::no_operation, which means that
+ * no pre- or post-processing of input or input is
+ * performed.
+ * @tparam fwd_iterator1 The type of the row index iterator.
+ * @tparam fwd_iterator2 The type of the column index iterator.
+ * @tparam fwd_iterator3 The type of the nonzero value iterator.
+ * @tparam length_type The type of the number of elements in each iterator.
+ * @tparam T The type of the supplied mask.
+ *
+ * \note By default, the iterator types are raw, unaliased, pointers.
+ *
+ * \warning This means that by default, input arrays are \em not
+ * allowed to overlap.
+ *
+ * Forward iterators will only be used to read from, never to assign to.
+ *
+ * \note It is therefore both legal and preferred to pass constant forward
+ * iterators, as opposed to mutable ones as \a I, \a J, and \a V.
+ *
+ * @param[in] I A forward iterator to \a cap row indices.
+ * @param[in] J A forward iterator to \a cap column indices.
+ * @param[in] V A forward iterator to \a cap nonzero values.
+ * @param[in] nz The number of items pointed to by \a I, \a J, \em and
+ * \a V.
+ * @param[in] mask An input element at coordinate \f$ (i,j) \f$ will only be
+ * added to this matrix if there exists a matching element
+ * \f$ \mathit{mask}_{ij} \f$ in the given \a mask that
+ * evaluates true. The matrix in \a mask must be
+ * of the same dimension as this matrix.
+ *
+ * @return alp::MISMATCH -# when an element from \a I dereferences to a value
+ * larger than the row dimension of this matrix, or
+ * -# when an element from \a J dereferences to a value
+ * larger than the column dimension of this matrix.
+ * When this error code is returned the state of this
+ * container will be as though this function was never
+ * called; however, the given forward iterators may
+ * have been copied and the copied iterators may have
+ * incurred multiple increments and dereferences.
+ * @return alp::OVERFLW When the internal data type used for storing the
+ * number of nonzeroes is not large enough to store
+ * the number of nonzeroes the user wants to assign.
+ * When this error code is returned the state of this
+ * container will be as though this function was never
+ * called; however, the given forward iterators may
+ * have been copied and the copied iterators may have
+ * incurred multiple increments and dereferences.
+ * @return alp::SUCCESS When the function completes successfully.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This function contains
+ * \f$ \Theta(\mathit{nz}\log\mathit{nz})+\mathcal{O}(m+n)) \f$
+ * amount of work.
+ * -# This function may dynamically allocate
+ * \f$ \Theta(\mathit{nz})+\mathcal{O}(m+n)) \f$ bytes of memory.
+ * -# A call to this function will use \f$ \mathcal{O}(m+n) \f$ bytes
+ * of memory beyond the memory in use at the function call entry.
+ * -# This function will copy each input forward iterator at most
+ * \em twice; the three input iterators \a I, \a J, and \a V thus
+ * may have exactly two copies each, meaning that all input may be
+ * traversed \em twice.
+ * -# Each of the at most six iterator copies will be incremented at
+ * most \f$ \mathit{nz} \f$ times.
+ * -# Each position of the each of the at most six iterator copies
+ * will be dereferenced exactly once.
+ * -# This function moves
+ * \f$ \Theta(\mathit{nz})+\mathcal{O}(m+n)) \f$ bytes of data.
+ * -# If the mask is nonempty, the performance costs of alp::eWiseMul
+ * on two matrix arguments must be added to the above costs.
+ * -# This function will likely make system calls.
+ * \endparblock
+ *
+ * \warning This is an extremely expensive function. Use sparingly and only
+ * when absolutely necessary.
+ *
+ * \note Streaming input can be implemented by supplying buffered
+ * iterators to this GraphBLAS implementation.
+ */
+ template< Descriptor descr = descriptors::no_operation,
+ template< typename, typename, typename > class accum = operators::right_assign,
+ template< typename, typename, typename > class dup = operators::add,
+ typename fwd_iterator1 = const size_t * __restrict__,
+ typename fwd_iterator2 = const size_t * __restrict__,
+ typename fwd_iterator3 = const D * __restrict__,
+ typename length_type = size_t,
+ typename T >
+ RC buildMatrix( const fwd_iterator1 I, const fwd_iterator2 J, const fwd_iterator3 V, const length_type nz, const Matrix< T, implementation > & mask ) {
+ (void)I;
+ (void)J;
+ (void)V;
+ (void)nz;
+ (void)mask;
+ return PANIC;
+ }
+
+ //@{
+ /**
+ * Provides the only mechanism to extract data from a GraphBLAS matrix.
+ *
+ * The order in which nonzero elements are returned is undefined.
+ *
+ * @return An iterator pointing to the first element of this matrix, if any;
+ * \em or an iterator in end position if this vector contains no
+ * nonzeroes.
+ *
+ * \note An `iterator in end position' compares equal to the const_iterator
+ * returned by cend().
+ *
+ * \parblock
+ * \par Performance semantics
+ * -# This function contains \f$ \mathcal{O}(1) \f$ work.
+ * -# This function is allowed to allocate dynamic memory.
+ * -# This function uses up to \f$ \mathcal{O}(1) \f$ more memory
+ * than already used by this application at entry.
+ * -# This function shall move at most \f$ \mathcal{O}(1) \f$ bytes
+ * of data.
+ * -# This function may make system calls.
+ * \endparblock
+ *
+ * \warning Avoid the use of this function within performance critical code
+ * sections.
+ *
+ * \note This function may make use of a const_iterator that is buffered,
+ * hence possibly causing its implicitly called constructor to
+ * allocate dynamic memory.
+ */
+ const_iterator cbegin() const {}
+
+ /**
+ * Same as cbegin().
+ * Since iterators are only supplied as a data extraction mechanism, there
+ * is no overloaded version of this function that returns a non-const
+ * iterator.
+ */
+ const_iterator begin() const {}
+ //@}
+
+ //@{
+ /**
+ * Indicates the end to the elements in this container.
+ *
+ * @return An iterator at the end position of this container.
+ *
+ * \parblock
+ * \par Performance semantics
+ * -# This function contains \f$ \mathcal{O}(1) \f$ work.
+ * -# This function is not allowed to allocate dynamic memory.
+ * -# This function uses up to \f$ \mathcal{O}(1) \f$ more memory
+ * than already used by this application at entry.
+ * -# This function shall move at most \f$ \mathcal{O}(1) \f$ bytes
+ * of data.
+ * -# This function shall \em not induce any system calls.
+ * \endparblock
+ *
+ * \note Even if cbegin() returns a buffered const_iterator that may require
+ * dynamic memory allocation and additional data movement, this
+ * specification disallows the same to happen for the construction of
+ * an iterator in end position.
+ */
+ const_iterator cend() const {}
+
+ /**
+ * Same as cend().
+ * Since iterators are only supplied as a data extraction mechanism, there
+ * is no overloaded version of this function that returns a non-const
+ * iterator.
+ */
+ const_iterator end() const {}
+ //@}
+
+ template< typename InputType, Backend backend >
+ RC clear( Matrix< InputType, backend > & A ) noexcept {
+ // this is the generic stub implementation
+ return UNSUPPORTED;
+ }
+ }; // class Matrix
+ } // namespace internal
+
+
+ template< typename T, typename Structure, enum Density density, typename View,
+ typename ImfR, typename ImfC, enum Backend backend >
+ class Matrix;
+
+ // These two comments are left here until a better place is found for them.
+ /**
+ * When a structured matrix instantiates a \em container it defines a new \em physical
+ * (concrete?) layout. This is characterized by an ALP container (aka an \a internal::Matrix) and a
+ * storage scheme that defines a unique interpretation of its content.
+ * The combination of the logical and physical layout of a structured matrix makes it
+ * possible to identify a precise mapping between an element in the structured matrix
+ * and a position within one or more 1/2D-arrays that store it.
+ */
+ //internal::Matrix< T, reference > * _container;
+
+ /**
+ * A container's storage scheme. \a storage_scheme is not exposed to the user as an option
+ * but can be defined by ALP at different points in the execution depending on the \a backend choice.
+ * For example, if the container is associated to an I/O matrix, with a reference backend
+ * it might be set to reflect the storage scheme of the user data as specified at buildMatrix.
+ * If \a backend is set to \a mlir then the scheme could be fixed by the JIT compiler to effectively
+ * support its optimization strategy.
+ * At construction time and until the moment the scheme decision is made it may be set to
+ * an appropriate default choice, e.g. if \a density is \a Density::Dense then
+ * \a Density::Dense::full could be used.
+ * \internal \todo Revisit this. The change of storage scheme type to enum (dense/sparse) and
+ * implementing storage mapping functions requires a change of this spec.
+ */
+ // Storage storage_scheme;
+
+ /**
+ * Check if type \a T is a Matrix.
+ */
+ template< typename T >
+ struct is_structured_matrix : std::false_type {};
+ template< typename T, typename Structure, enum Density density, typename View, typename ImfR, typename ImfC, enum Backend backend >
+ struct is_structured_matrix< Matrix< T, Structure, density, View, ImfR, ImfC, backend > > : std::true_type {};
+
+} // end namespace ``alp''
+
+#endif // end _H_ALP_MATRIX_BASE
diff --git a/include/alp/base/scalar.hpp b/include/alp/base/scalar.hpp
new file mode 100644
index 000000000..45a36c7d9
--- /dev/null
+++ b/include/alp/base/scalar.hpp
@@ -0,0 +1,229 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _H_ALP_SCALAR_BASE
+#define _H_ALP_SCALAR_BASE
+
+#include //size_t
+#include
+
+#include
+#include
+#include
+#include
+
+
+namespace alp {
+
+ /**
+ * \brief An ALP scalar.
+ *
+ * This is an opaque data type for scalars.
+ *
+ * @tparam T The type of the vector elements. \a T shall not
+ * be an ALP type.
+ * @tparam Structure One of the structures. One of possible use cases
+ * for a structured scalar is a random structure.
+ * Depending on the backend implementation, this may mean,
+ * for example, randomizing the scalar value on each
+ * interaction with the scalar.
+ *
+ * \warning Creating an alp::Scalar of other ALP types is
+ * not allowed.
+ * Passing an ALP type as template parameter will lead to
+ * undefined behaviour.
+ *
+ */
+ template< typename T, typename Structure, enum Backend backend >
+ class Scalar {
+
+ public:
+ /** @see Vector::value_type. */
+ typedef T value_type;
+
+ /** @see Vector::lambda_reference */
+ typedef T& lambda_reference;
+
+ /**
+ * The default ALP scalar constructor.
+ *
+ * The constructed object will be uninitialised after successful construction.
+ *
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+ * -# This constructor may allocate \f$ \Theta(1) \f$ bytes
+ * of dynamic memory.
+ * -# This constructor will use \f$ \Theta(1) \f$ extra bytes of
+ * memory beyond that at constructor entry.
+ * -# This constructor incurs \f$ \Theta(1) \f$ data movement.
+ * -# This constructor \em may make system calls.
+ * \endparblock
+ *
+ */
+ Scalar() {}
+
+ /**
+ * The ALP scalar constructor for converting a reference to C/C++ scalar
+ * to ALP scalar.
+ *
+ * The constructed object will be initialized after successful construction.
+ *
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+ * -# This constructor may allocate \f$ \Theta(1) \f$ bytes
+ * of dynamic memory.
+ * -# This constructor will use \f$ \Theta(1) \f$ extra bytes of
+ * memory beyond that at constructor entry.
+ * -# This constructor incurs \f$ \Theta(1) \f$ data movement.
+ * -# This constructor \em may make system calls.
+ * \endparblock
+ *
+ * \warning This constructor saves the reference to the provided value.
+ * Therefore, the changes to the container or the value will
+ * be mirrored into each-other. For preserving the separation,
+ * use Scalar( const T ) version.
+ *
+ */
+ explicit Scalar( T &value ) {
+ (void)value;
+ }
+
+ /**
+ * The ALP scalar constructor for converting a C/C++ scalar to ALP scalar.
+ *
+ * The constructed object will be initialized after successful construction.
+ *
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+ * -# This constructor may allocate \f$ \Theta(1) \f$ bytes
+ * of dynamic memory.
+ * -# This constructor will use \f$ \Theta(1) \f$ extra bytes of
+ * memory beyond that at constructor entry.
+ * -# This constructor incurs \f$ \Theta(1) \f$ data movement.
+ * -# This constructor \em may make system calls.
+ * \endparblock
+ *
+ */
+ explicit Scalar( T value ) {
+ (void)value;
+ }
+
+ /**
+ * Copy constructor.
+ *
+ * @param other The scalar to copy. The initialization state of the copy
+ * reflects the state of \a other.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+ * -# This constructor allocates \f$ \Theta(1) \f$ bytes
+ * of dynamic memory.
+ * -# This constructor incurs \f$ \Theta(1) \f$ of data
+ * movement.
+ * -# This constructor \em may make system calls.
+ * \endparblock
+ *
+ */
+ Scalar( const Scalar &other ) noexcept {
+ (void)other;
+ }
+
+ /**
+ * Move constructor. The new scalar equals the given
+ * scalar. Invalidates the use of the input scalar.
+ *
+ * @param[in] other The ALP scalar to move to this new instance.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This constructor entails \f$ \Theta(1) \f$ amount of work.
+ * -# This constructor will not allocate any new dynamic memory.
+ * -# This constructor will use \f$ \Theta(1) \f$ extra bytes of
+ * memory beyond that at constructor entry.
+ * -# This constructor will move \f$ \Theta(1) \f$ bytes of data.
+ * \endparblock
+ */
+ Scalar( Scalar &&other ) noexcept {
+ (void)other;
+ }
+
+ /**
+ * Returns a lambda reference to the value of this Scalar. The user
+ * ensures that the requested reference only corresponds to a pre-existing
+ * nonzero in this scalar, or undefined behaviour will occur.
+ * This addresses the sparse specialization of scalars. In the dense
+ * context, scalar is considered to have a nonzero value \em iff initialized.
+ *
+ * A lambda reference to the value of this scalar is only valid when used
+ * inside a lambda function evaluated via alp::eWiseLambda. Outside this
+ * scope the returned reference incurs undefined behaviour.
+ *
+ *
+ * \warning In parallel contexts the use of a returned lambda reference
+ * outside the context of an eWiseLambda will incur at least one of
+ * the following ill effects: it may
+ * -# fail outright,
+ * -# work on stale data,
+ * -# work on incorrect data, or
+ * -# incur high communication costs to guarantee correctness.
+ * In short, such usage causes undefined behaviour. Implementers are
+ * \em not advised to provide GAS-like functionality through this
+ * interface, as it invites bad programming practices and bad
+ * algorithm design decisions. This operator is instead intended to
+ * provide for generic BLAS0-type operations only.
+ *
+ * \note For I/O, use the iterator retrieved via cbegin() instead of
+ * relying on a lambda_reference.
+ *
+ * @return A lambda reference to the value of this scalar
+ *
+ * \par Example.
+ * See alp::eWiseLambda() for a practical and useful example.
+ *
+ * \warning There is no similar concept in the official GraphBLAS specs.
+ *
+ * @see lambda_reference For more details on the returned reference type.
+ * @see alp::eWiseLambda For one legal way in which to use the returned
+ * #lambda_reference.
+ */
+ lambda_reference operator*() noexcept {
+#ifndef _ALP_NO_EXCEPTIONS
+ assert( false ); // Requesting lambda reference of unimplemented Scalar backend.
+#endif
+ }
+
+ /** Returns a constant reference to the scalar value.
+ * See the non-constant variant for further details.
+ */
+ const lambda_reference operator*() const noexcept {
+#ifndef _ALP_NO_EXCEPTIONS
+ assert( false ); // Requesting lambda reference of unimplemented Scalar backend.
+#endif
+ }
+
+ }; // class Scalar
+
+} // namespace alp
+
+#endif // _H_ALP_SCALAR_BASE
diff --git a/include/alp/base/vector.hpp b/include/alp/base/vector.hpp
new file mode 100644
index 000000000..4ba5440a8
--- /dev/null
+++ b/include/alp/base/vector.hpp
@@ -0,0 +1,892 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 10th of August, 2016
+ */
+
+#ifndef _H_ALP_VECTOR_BASE
+#define _H_ALP_VECTOR_BASE
+
+#include //size_t
+#include //std::iterator
+#include
+#include //pair
+
+#include
+#include
+#include
+#include
+#include
+
+
+namespace alp {
+ namespace internal {
+ /**
+ * A GraphBLAS vector. This is an opaque data type that can be provided to
+ * any GraphBLAS function, such as, alp::eWiseMulAdd, for example.
+ *
+ * @tparam D The type of an element of this vector. \a D shall not be a
+ * GraphBLAS type.
+ * @tparam implementation Allows different backends to implement different
+ * versions of this data type.
+ *
+ * \warning Creating an alp::Vector of other GraphBLAS types is
+ * not allowed.
+ * Passing a GraphBLAS type as template parameter will lead to
+ * undefined behaviour.
+ *
+ * \note The implementation found in the same file as this documentation
+ * catches invalid backends only. This class should never compile.
+ *
+ * @see alp::Vector< D, reference > for an actual implementation example.
+ */
+ template< typename D, enum Backend implementation >
+ class Vector {
+
+ public :
+
+ /** The type of elements stored in this vector. */
+ typedef D value_type;
+
+ /**
+ * Defines a reference to a value of type \a D. This reference is only valid
+ * when used inside a lambda function that is passed to alp::eWiseLambda().
+ *
+ * \warning Any other use of this reference incurs undefined behaviour.
+ *
+ * \par Example.
+ * An example valid use:
+ * \code
+ * void f(
+ * Vector< D >::lambda_reference x,
+ * const Vector< D >::lambda_reference y,
+ * const Vector< D > &v
+ * ) {
+ * alp::eWiseLambda( [x,y](const size_t i) {
+ * x += y;
+ * }, v );
+ * }
+ * \endcode
+ * This code adds \a y to \a x for every element in \a v. For a more useful
+ * example, see alp::eWiseLambda.
+ *
+ * \warning Note that, unlike the above, this below code is illegal since it
+ * does not evaluate via a lambda passed to any of the above
+ * GraphBLAS lambda functions (such as alp::eWiseLambda).
+ * \code{.cpp}
+ * void f(
+ * Vector< D >::lambda_reference x,
+ * const Vector< D >::lambda_reference y
+ * ) {
+ * x += y;
+ * }
+ * \endcode
+ * Also this usage is illegal since it does not rely on any
+ * GraphBLAS-approved function listed above:
+ * \code{.cpp}
+ * void f(
+ * Vector< D >::lambda_reference x,
+ * const Vector< D >::lambda_reference y
+ * ) {
+ * std::function< void() > f =
+ * [x,y](const size_t i) {
+ * x += y;
+ * };
+ * f();
+ * }
+ * \endcode
+ *
+ * \warning There is no similar concept in the official GraphBLAS specs.
+ *
+ * @see alp::Vector::operator[]()
+ * @see alp::eWiseLambda
+ */
+ typedef D & lambda_reference;
+
+ /**
+ * A standard iterator for the Vector< D > class.
+ *
+ * This iterator is used for data extraction only. Hence only this const
+ * version is supplied.
+ *
+ * \warning Comparing two const iterators corresponding to different
+ * containers leads to undefined behaviour.
+ * \warning Advancing an iterator past the end iterator of the container
+ * it corresponds to leads to undefined behaviour.
+ * \warning Modifying the contents of a container makes any use of any
+ * iterator derived from it incur invalid behaviour.
+ * \note These are standard limitations of STL iterators.
+ */
+ class const_iterator : public std::iterator< std::forward_iterator_tag, std::pair< const size_t, const D >, size_t > {
+
+ public :
+
+ /** Standard equals operator. */
+ bool
+ operator==( const const_iterator & other ) const { (void)other; return false; }
+
+ /** @returns The negation of operator==(). */
+ bool operator!=( const const_iterator & other ) const {
+ (void)other;
+ return true;
+ }
+
+ /**
+ * Dereferences the current position of this iterator.
+ *
+ * @return If this iterator is valid and not in end position,
+ * this returns a new std::pair with in its first
+ * field the position of the nonzero value, and in its
+ * second field the value of the nonzero.
+ *
+ * \note If this iterator is invalid or in end position, the result is
+ * undefined.
+ */
+ std::pair< const size_t, const D > operator*() const {
+ return std::pair< const size_t, const D >();
+ }
+
+ /**
+ * Advances the position of this iterator by one.
+ *
+ * If the current position corresponds to the last element in the
+ * container, the new position of this iterator will be its end
+ * position.
+ *
+ * If the current position of this iterator is already the end
+ * position, this iterator will become invalid; any use of invalid
+ * iterators will lead to undefined behaviour.
+ *
+ * @return A reference to this iterator.
+ */
+ const_iterator & operator++() {
+ return *this;
+ }
+
+ }; // class const_iterator
+
+ /**
+ * The only way to create an empty GraphBLAS vector. The given dimension will
+ * be fixed throughout the lifetime of this container.
+ *
+ * The vector will be empty after successful construction.
+ *
+ * @param[in] n The dimension of this vector.
+ *
+ * \parblock
+ * \par Performance semantics
+ * -# This constructor completes in \f$ \mathcal{O}(n) \f$ time.
+ * -# This constructor allocates \f$ \mathcal{O}(n) \f$ bytes of
+ * dynamic memory.
+ * -# This constructor moves at most \f$ \mathcal{O}( n ) \f$ bytes
+ * of data.
+ * -# This constructor may make system calls.
+ * \endparblock
+ *
+ * \warning Avoid the use of this constructor within performance critical
+ * code sections.
+ */
+ Vector( const size_t n ) {
+ (void)n;
+ }
+
+ /**
+ * Move constructor.
+ *
+ * This will make the new vector equal the given GraphBLAS vector while
+ * destroying the supplied GraphBLAS vector.
+ *
+ * This function always succeeds and will not throw exceptions.
+ *
+ * @param[in] x The GraphBLAS vector to move to this new container.
+ *
+ * \parblock
+ * \par Performance semantics
+ * -# This constructor completes in \f$ \Theta(1) \f$ time.
+ * -# This constructor does not allocate new data on the heap.
+ * -# This constructor uses \f$ \mathcal{O}(1) \f$ more memory than
+ * already used by this application at constructor entry.
+ * -# This constructor incurs at most \f$ \mathcal{O}(1) \f$ bytes of
+ * data movement.
+ * \endparblock
+ */
+ Vector( Vector< D, implementation > &&x ) noexcept {
+ (void)x;
+ }
+
+ /**
+ * Move-from-temporary assignment.
+ *
+ * @param[in,out] x The temporary instance from which this instance shall
+ * take over its resources.
+ *
+ * After a call to this function, \a x shall correspond to an empty vector.
+ *
+ * \parblock
+ * \par Performance semantics
+ * -# This move assignment completes in \f$ \Theta(1) \f$ time.
+ * -# This move assignment may not make system calls.
+ * -# this move assignment moves \f$ \Theta(1) \f$ data only.
+ * \endparblock
+ */
+ Vector< D, implementation >& operator=( Vector< D, implementation > &&x ) noexcept {
+ (void)x;
+ return *this;
+ }
+
+ /**
+ * Default destructor. Frees all associated memory areas.
+ *
+ * \parblock
+ * \par Performance semantics
+ * -# This destructor contains \f$ \mathcal{O}(n) \f$ work, where
+ * \f$ n \f$ is the capacity of this vector.
+ * -# This destructor is only allowed to free memory, not allocate.
+ * -# This destructor uses \f$ \mathcal{O}(1) \f$ more memory than
+ * already used by this application at entry.
+ * -# This destructor shall move at most \f$ \mathcal{O}(n) \f$ bytes
+ * of data.
+ * -# This destructor will make system calls.
+ * \endparblock
+ *
+ * \warning Avoid the use of this destructor within performance critical
+ * code sections.
+ *
+ * \note Destruction of this GraphBLAS container is the only way to
+ * guarantee that any underlying dynamically allocated memory is
+ * freed.
+ */
+ ~Vector() {}
+
+ //@{
+ /**
+ * Provides the only mechanism to extract data from this GraphBLAS vector.
+ *
+ * The order in which nonzero elements are returned is undefined.
+ *
+ * @return An iterator pointing to the first element of this vector, if any;
+ * \em or an iterator in end position if this vector contains no
+ * nonzeroes.
+ *
+ * \note An `iterator in end position' compares equal to the const_iterator
+ * returned by cend().
+ *
+ * \parblock
+ * \par Performance semantics
+ * -# This function contains \f$ \mathcal{O}(1) \f$ work.
+ *        -# This function is allowed to allocate dynamic memory.
+ * -# This function uses up to \f$ \mathcal{O}(1) \f$ more memory
+ * than already used by this application at entry.
+ * -# This function shall move at most \f$ \mathcal{O}(1) \f$ bytes
+ * of data.
+ * -# This function may make system calls.
+ * \endparblock
+ *
+ * \warning Avoid the use of this function within performance critical code
+ * sections.
+ *
+ * \note This function may make use of a const_iterator that is buffered,
+ * hence possibly causing its implicitly called constructor to
+ * allocate dynamic memory.
+ */
+ const_iterator cbegin() const {}
+
+ /**
+ * Same as cbegin().
+ * Since iterators are only supplied as a data extraction mechanism, there
+ * is no overloaded version of this function that returns a non-const
+ * iterator.
+ */
+ const_iterator begin() const {}
+ //@}
+
+ //@{
+ /**
+ * Indicates the end to the elements in this container.
+ *
+ * @return An iterator at the end position of this container.
+ *
+ * \parblock
+ * \par Performance semantics
+ * -# This function contains \f$ \mathcal{O}(1) \f$ work.
+ *        -# This function is not allowed to allocate dynamic memory.
+ * -# This function uses up to \f$ \mathcal{O}(1) \f$ more memory
+ * than already used by this application at entry.
+ * -# This function shall move at most \f$ \mathcal{O}(1) \f$ bytes
+ * of data.
+ * -# This function shall \em not induce any system calls.
+ * \endparblock
+ *
+ * \note Even if cbegin() returns a buffered const_iterator that may require
+ * dynamic memory allocation and additional data movement, this
+ * specification disallows the same to happen for the construction of
+ * an iterator in end position.
+ */
+ const_iterator cend() const {}
+
+ /**
+ * Same as cend().
+ * Since iterators are only supplied as a data extraction mechanism, there
+ * is no overloaded version of this function that returns a non-const
+ * iterator.
+ */
+ const_iterator end() const {}
+ //@}
+
+ /**
+ * Copy from raw user-supplied data into a vector.
+ *
+ * This is the dense unmasked variant.
+ *
+ * @tparam descr The pre-processing descriptor to use.
+ * @tparam fwd_iterator The type of input iterator. By default, this will be
+ * a raw \em unaliased pointer.
+ * @tparam Accum The accumulator type used to merge incoming new
+ * elements with existing contents, if any.
+ *
+ * @param[in] accum The accumulator used to merge incoming new elements with
+ * existing content, if any.
+ * @param[in] start The iterator to the first element that should be copied
+ * into this GraphBLAS vector.
+ * @param[in] end Iterator shifted exactly one past the last element that
+ * should be copied into this GraphBLAS vector.
+ * @param[out] npos The last iterator position after exiting this function.
+ * In most cases this will equal \a end. This parameter is
+ * optional.
+ *
+ * The first element from \a start will be copied into the element with index
+ * \f$ 0 \f$ in this vector. The \f$ k \f$-th element will be copied into
+ * the element with index \f$ k - 1 \f$. The iterator \a start will be
+ * incremented along with \f$ k \f$ until it compares equal to \a end, or
+ * until it has been incremented \a n times, where \a n is the dimension of
+ * this vector. In the latter case, any remaining values are ignored.
+ *
+ * @return alp::SUCCESS This function always succeeds.
+ *
+ * \note The default accumulator expects the input values to be of the
+ *       same type as nonzero elements in this vector, and will cause old
+ * values to be overwritten by the incoming new values.
+ *
+ * \note Previous contents of the vector are retained. If these are to be
+ * cleared first, see clear(). The default accumulator is NOT an
+ * alternative since any pre-existing values corresponding to entries
+ * in the mask that evaluate to false will be retained.
+ *
+ * \note The parameter \a n can be used to ingest only a subset of a larger
+ * data structure pointed to by \a start. At the end of the call, \a
+ * start will then not be equal to \a end, but instead point to the
+ * first element of the remainder of the larger data structure.
+ *
+ * \par Valid descriptors
+ * alp::descriptors::no_operation, alp::descriptors::no_casting.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If alp::descriptors::no_casting is specified, then 1) the first domain of
+ * \a accum must match the type of \a val, 2) the second domain must match
+ * the type \a D of nonzeroes in this vector, and 3) the third domain must
+ * match \a D. If one of these is not true, the code shall not compile.
+ *
+ * \parblock
+ * \par Performance semantics
+ * If the capacity of this container is sufficient to perform the
+ * requested operation, then:
+ * -# This function contains \f$ \Theta(n) \f$ work.
+ * -# This function will take at most \f$ \Theta(1) \f$ memory beyond
+ * the memory already used by the application before the call to
+ * this function.
+ * -# This function moves at most \f$ n ( 2\mathit{sizeof}(D) +
+ * \mathit{sizeof}(\mathit{bool}) ) + \mathcal{O}(1) \f$ bytes of
+ * data.
+ * \endparblock
+ *
+ * \parblock
+ * \par Performance exceptions
+ * If the capacity of this container at function entry is insufficient
+ * to perform the requested operation, then, in addition to the above:
+ *       -# this function allocates \f$ \Theta(n) \f$ bytes of memory.
+ * -# this function frees \f$ \mathcal{O}(n) \f$ bytes of memory.
+ * -# this function will make system calls.
+ * \endparblock
+ *
+ * \note An implementation may ensure that at object construction the
+ * capacity is maximised. In that case, the above performance
+ * exceptions will never come to pass.
+ *
+ * @see alp::buildVector for the GraphBLAS standard dispatcher to this
+ * function.
+ */
+ template< Descriptor descr = descriptors::no_operation, class Accum = typename operators::right_assign< D, D, D >, typename fwd_iterator = const D * __restrict__ >
+ RC build( const Accum & accum, const fwd_iterator start, const fwd_iterator end, fwd_iterator npos ) {
+ (void)accum;
+ (void)start;
+ (void)end;
+ (void)npos;
+ return PANIC;
+ }
+
+ /**
+ * Copy from raw user-supplied data into a vector.
+ *
+ * This is the sparse non-masked variant.
+ *
+ * @tparam descr The pre-processing descriptor to use.
+ * @tparam Accum The type of the operator used to combine newly input
+ * data with existing data, if any.
+ * @tparam ind_iterator The type of index input iterator. By default, this
+ * will be a raw \em unaliased pointer to elements of
+ * type \a size_t.
+ * @tparam nnz_iterator The type of nonzero input iterator. By default, this
+ * will be a raw \em unaliased pointer to elements of
+ * type \a D.
+ * @tparam Dup The type of operator used to combine any duplicate
+ * input values.
+ *
+ * @param[in] accum The operator to be used when writing back the result
+ * of data that was already in this container prior to
+ * calling this function.
+ * @param[in] ind_start The iterator to the first index value that should be
+ * added to this GraphBLAS vector.
+ * @param[in] ind_end Iterator corresponding to the end position of
+ * \a ind_start.
+ * @param[in] nnz_start The iterator to the first nonzero value that should
+ * be added to this GraphBLAS vector.
+ * @param[in] nnz_end Iterator corresponding to the end position of
+ * \a nnz_start.
+ * @param[in] dup The operator to be used when handling multiple
+ * nonzero values that are to be mapped to the same
+ * index position.
+ *
+ * The first element from \a nnz_start will be copied into this vector at
+ * the index corresponding to the first element from \a ind_start. Then,
+ * both nonzero and index value iterators advance to add the next input
+ * element and the process repeats until either of the input iterators
+ * reach \a nnz_end or \a ind_end, respectively.
+ * If at that point one of the iterators still has remaining elements, then
+ * those elements are ignored.
+ *
+ * @return alp::MISMATCH When attempting to insert a nonzero value at an
+ * index position that is larger or equal to the
+ * dimension of this vector. When this code is
+ * returned, the contents of this container are
+ * undefined.
+ * @return alp::SUCCESS When all elements are successfully assigned.
+ *
+ * \note The default accumulator expects \a D to be of the same type
+ * as nonzero elements of this operator, and will cause old
+ * values to be overwritten by the incoming new values.
+ *
+ * \note The default \a dup expects \a D to be of the same type as nonzero
+ * elements of this operator, and will cause duplicate values to be
+ * discarded in favour of the last seen value.
+ *
+ * \note Previous contents of the vector are retained. If these are to be
+ * cleared first, see clear(). The default accumulator is NOT an
+ * alternative since any pre-existing values corresponding to entries
+ * in the mask that evaluate to false will be retained.
+ *
+ * \par Valid descriptors
+ * alp::descriptors::no_operation, alp::descriptors::no_casting,
+ * alp::descriptors::no_duplicates.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If alp::descriptors::no_casting is specified, then 1) the first domain of
+ * \a accum must match the type of \a D, 2) the second domain must match
+ * nnz_iterator::value_type, and 3) the third domain must match \a D. If one of
+ * these is not true, the code shall not compile.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This function contains \f$ \Theta(n) \f$ work.
+ * -# This function will take at most \f$ \Theta(1) \f$ memory beyond
+ * the memory already used by the application before the call to
+ * this function.
+ * -# This function moves at most \f$ n ( 2\mathit{sizeof}(D) +
+ * \mathit{sizeof}(\mathit{bool}) ) + \mathcal{O}(1) \f$ bytes of
+ * data.
+ * \endparblock
+ *
+ * \parblock
+ * \par Performance exceptions
+ * If the capacity of this container at function entry is insufficient
+ * to perform the requested operation, then, in addition to the above:
+ *       -# this function allocates \f$ \Theta(n) \f$ bytes of memory.
+ * -# this function frees \f$ \mathcal{O}(n) \f$ bytes of memory.
+ * -# this function will make system calls.
+ * \endparblock
+ *
+ * \note An implementation may ensure that at object construction the
+ * capacity is maximised. In that case, the above performance
+ * exceptions will never come to pass.
+ *
+ * @see alp::buildVector for the GraphBLAS standard dispatcher to this
+ * function.
+ */
+ template< Descriptor descr = descriptors::no_operation,
+ class Accum = operators::right_assign< D, D, D >,
+ typename ind_iterator = const size_t * __restrict__,
+ typename nnz_iterator = const D * __restrict__,
+ class Dup = operators::right_assign< D, D, D > >
+ RC build( const Accum & accum, const ind_iterator ind_start, const ind_iterator ind_end, const nnz_iterator nnz_start, const nnz_iterator nnz_end, const Dup & dup = Dup() ) {
+ (void)accum;
+ (void)ind_start;
+ (void)ind_end;
+ (void)nnz_start;
+ (void)nnz_end;
+ (void)dup;
+ return PANIC;
+ }
+
+ /**
+ * Copy from raw user-supplied data into a vector.
+ *
+ * This is the sparse masked variant.
+ *
+ * @tparam descr The pre-processing descriptor to use.
+ * @tparam mask_type The value type of the \a mask vector. This type is
+ * \em not required to be \a bool.
+ * @tparam Accum The type of the operator used to combine newly input
+ * data with existing data, if any.
+ * @tparam ind_iterator The type of index input iterator. By default, this
+ * will be a raw \em unaliased pointer to elements of
+ * type \a size_t.
+ * @tparam nnz_iterator The type of nonzero input iterator. By default, this
+ * will be a raw \em unaliased pointer to elements of
+ * type \a D.
+ * @tparam Dup The type of operator used to combine any duplicate
+ * input values.
+ *
+ * @param[in] mask An element is only added to this container if its
+ * index \f$ i \f$ has a nonzero at the same position
+ * in \a mask that evaluates true.
+ * @param[in] accum The operator to be used when writing back the result
+ * of data that was already in this container prior to
+ * calling this function.
+ * @param[in] ind_start The iterator to the first index value that should be
+ * added to this GraphBLAS vector.
+ * @param[in] ind_end Iterator corresponding to the end position of
+ * \a ind_start.
+ * @param[in] nnz_start The iterator to the first nonzero value that should
+ * be added to this GraphBLAS vector.
+ * @param[in] nnz_end Iterator corresponding to the end position of
+ * \a nnz_start.
+ * @param[in] dup The operator to be used when handling multiple
+ * nonzero values that are to be mapped to the same
+ * index position.
+ *
+ * The first element from \a nnz_start will be copied into this vector at
+ * the index corresponding to the first element from \a ind_start. Then,
+ * both nonzero and index value iterators advance to add the next input
+ * element and the process repeats until either of the input iterators
+ * reach \a nnz_end or \a ind_end, respectively.
+ * If at that point one of the iterators still has remaining elements, then
+ * those elements are ignored.
+ *
+ * @return alp::MISMATCH When attempting to insert a nonzero value at an
+ * index position that is larger or equal to the
+ * dimension of this vector. When this code is
+ * returned, the contents of this container are
+ * undefined.
+ * @return alp::SUCCESS When all elements are successfully assigned.
+ *
+ * \note The default accumulator expects \a D to be of the same type
+ * as nonzero elements of this operator, and will cause old
+ * values to be overwritten by the incoming new values.
+ *
+ * \note The default \a dup expects \a D to be of the same type as nonzero
+ * elements of this operator, and will cause duplicate values to be
+ * discarded in favour of the last seen value.
+ *
+ * \note Previous contents of the vector are retained. If these are to be
+ * cleared first, see clear(). The default accumulator is NOT an
+ * alternative since any pre-existing values corresponding to entries
+ * in the mask that evaluate to false will be retained.
+ *
+ * \par Valid descriptors
+ * alp::descriptors::no_operation, alp::descriptors::no_casting,
+ * alp::descriptors::invert_mask, alp::descriptors::no_duplicates.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If alp::descriptors::no_casting is specified, then 1) the first domain of
+ * \a accum must match the type of \a D, 2) the second domain must match
+ * nnz_iterator::value_type, and 3) the third domain must match \a D. If one of
+ * these is not true, the code shall not compile.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * -# This function contains \f$ \Theta(n) \f$ work.
+ * -# This function will take at most \f$ \Theta(1) \f$ memory beyond
+ * the memory already used by the application before the call to
+ * this function.
+ * -# This function moves at most \f$ n ( 2\mathit{sizeof}(D) +
+ * \mathit{sizeof}(\mathit{bool}) ) + \mathcal{O}(1) \f$ bytes of
+ * data.
+ * \endparblock
+ *
+ * \parblock
+ * \par Performance exceptions
+ * If the capacity of this container at function entry is insufficient
+ * to perform the requested operation, then, in addition to the above:
+ *       -# this function allocates \f$ \Theta(n) \f$ bytes of memory.
+ * -# this function frees \f$ \mathcal{O}(n) \f$ bytes of memory.
+ * -# this function will make system calls.
+ * \endparblock
+ *
+ * \note An implementation may ensure that at object construction the
+ * capacity is maximised. In that case, the above performance
+ * exceptions will never come to pass.
+ *
+ * @see alp::buildVector for the GraphBLAS standard dispatcher to this
+ * function.
+ */
+ template< Descriptor descr = descriptors::no_operation,
+ typename mask_type,
+ class Accum,
+ typename ind_iterator = const size_t * __restrict__,
+ typename nnz_iterator = const D * __restrict__,
+ class Dup = operators::right_assign< D, typename nnz_iterator::value_type, D > >
+ RC build( const Vector< mask_type, implementation > mask,
+ const Accum & accum,
+ const ind_iterator ind_start,
+ const ind_iterator ind_end,
+ const nnz_iterator nnz_start,
+ const nnz_iterator nnz_end,
+ const Dup & dup = Dup() ) {
+ (void)mask;
+ (void)accum;
+ (void)ind_start;
+ (void)ind_end;
+ (void)nnz_start;
+ (void)nnz_end;
+ (void)dup;
+ return PANIC;
+ }
+
+ /**
+ * Return the dimension of this vector.
+ *
+ * @tparam T The integral output type.
+ *
+ * @param[out] size Where to store the size of this vector.
+ * The initial value is ignored.
+ *
+ * @returns alp::SUCCESS When the function call completes successfully.
+ *
+ * \note This function cannot fail.
+ *
+ * \parblock
+ * \par Performance semantics
+ * This function
+ * -# contains \f$ \Theta(1) \f$ work,
+ * -# will not allocate new dynamic memory,
+ * -# will take at most \f$ \Theta(1) \f$ memory beyond the memory
+ * already used by the application before the call to this
+ * function.
+ * -# will move at most \f$ \mathit{sizeof}(T) +
+ * \mathit{sizeof}(\mathit{size\_t}) \f$ bytes of data.
+ * \endparblock
+ */
+ template< typename T >
+ RC size( T & size ) const {
+ (void)size;
+ return PANIC;
+ }
+
+ /**
+ * Return the number of nonzeroes in this vector.
+ *
+ * @tparam T The integral output type.
+ *
+ * @param[out] nnz Where to store the number of nonzeroes contained in this
+ * vector. Its initial value is ignored.
+ *
+ * @returns alp::SUCCESS When the function call completes successfully.
+ *
+ * \note This function cannot fail.
+ *
+ * \parblock
+ * \par Performance semantics
+ * This function
+ * -# contains \f$ \Theta(1) \f$ work,
+ * -# will not allocate new dynamic memory,
+ * -# will take at most \f$ \Theta(1) \f$ memory beyond the memory
+ * already used by the application before the call to this
+ * function.
+ * -# will move at most \f$ \mathit{sizeof}(T) +
+ * \mathit{sizeof}(\mathit{size\_t}) \f$ bytes of data.
+ * \endparblock
+ */
+ template< typename T >
+ RC nnz( T & nnz ) const {
+ (void)nnz;
+ return PANIC;
+ }
+
+ /**
+ * Returns a lambda reference to an element of this sparse vector.
+ *
+ * A lambda reference to an element of this vector is only valid when used
+ * inside a lambda function evaluated via alp::eWiseLambda. The lambda
+ * function is called for specific indices only-- that is, the GraphBLAS
+ * implementation decides at which elements to dereference this container.
+ * Outside this scope the returned reference incurs undefined behaviour.
+ *
+ * \warning In particular, for the given index \a i by the lambda function,
+ * it shall be \em illegal to refer to indices relative to that
+ * \a i; including, but not limited to, \f$ i+1 \f$, \f$ i-1 \f$, et
+ * cetera.
+ *
+ * \note As a consequence, this function cannot be used to perform stencil
+ * or halo based operations.
+ *
+ * If a previously non-existing entry of the vector is requested, a new
+ * nonzero is added at position \a i in this vector. The new element will
+ * have its initial value equal to the \em identity corresponding to the
+ * given monoid.
+ *
+ * \warning In parallel contexts the use of a returned lambda reference
+ * outside the context of an eWiseLambda will incur at least one of
+ * the following ill effects: it may
+ * -# fail outright,
+ * -# work on stale data,
+ * -# work on incorrect data, or
+ * -# incur high communication costs to guarantee correctness.
+ * In short, such usage causes undefined behaviour. Implementers are
+ * \em not advised to provide GAS-like functionality through this
+ * interface, as it invites bad programming practices and bad
+ * algorithm design decisions. This operator is instead intended to
+ * provide for generic BLAS1-type operations only.
+ *
+ * \note For I/O, use the iterator retrieved via cbegin() instead of
+ * relying on a lambda_reference.
+ *
+ * @param[in] i Which element to return a lambda reference of.
+ * @param[in] monoid Under which generalised monoid to interpret the
+ * requested \f$ i \f$th element of this vector.
+ *
+ * \note The \a monoid (or a ring) is required to be able to interpret a
+ * sparse vector. A user who is sure this vector is dense, or otherwise
+ *       is able to ensure that a lambda_reference will only be requested
+ *       at elements where nonzeroes already exist, may refer to
+ *       Vector::operator[].
+ *
+ * @return A lambda reference to the element \a i of this vector.
+ *
+ * \par Example.
+ * See alp::eWiseLambda() for a practical and useful example.
+ *
+ * \warning There is no similar concept in the official GraphBLAS specs.
+ *
+ * @see lambda_reference For more details on the returned reference type.
+ * @see alp::eWiseLambda For one legal way in which to use the returned
+ * #lambda_reference.
+ */
+ template< class Monoid >
+ lambda_reference operator()( const size_t i, const Monoid & monoid = Monoid() ) {
+ (void)i;
+ (void)monoid;
+ return PANIC;
+ }
+
+ /**
+ * Returns a lambda reference to an element of this vector. The user
+ * ensures that the requested reference only corresponds to a pre-existing
+ * nonzero in this vector, or undefined behaviour will occur.
+ *
+ * A lambda reference to an element of this vector is only valid when used
+ * inside a lambda function evaluated via alp::eWiseLambda. The lambda
+ * function is called for specific indices only-- that is, the GraphBLAS
+ * implementation decides at which elements to dereference this container.
+ * Outside this scope the returned reference incurs undefined behaviour.
+ *
+ * \warning In particular, for the given index \a i by the lambda function,
+ * it shall be \em illegal to refer to indices relative to that
+ * \a i; including, but not limited to, \f$ i+1 \f$, \f$ i-1 \f$, et
+ * cetera.
+ *
+ * \note As a consequence, this function cannot be used to perform stencil
+ * or halo based operations.
+ *
+ * If a previously non-existing entry of the vector is requested, undefined
+ * behaviour will occur. Functions that are defined to work with references
+ * of this kind, such as alp::eWiseLambda, define exactly which elements are
+ * dereferenced.
+ *
+ * \warning In parallel contexts the use of a returned lambda reference
+ * outside the context of an eWiseLambda will incur at least one of
+ * the following ill effects: it may
+ * -# fail outright,
+ * -# work on stale data,
+ * -# work on incorrect data, or
+ * -# incur high communication costs to guarantee correctness.
+ * In short, such usage causes undefined behaviour. Implementers are
+ * \em not advised to provide GAS-like functionality through this
+ * interface, as it invites bad programming practices and bad
+ * algorithm design decisions. This operator is instead intended to
+ * provide for generic BLAS1-type operations only.
+ *
+ * \note For I/O, use the iterator retrieved via cbegin() instead of
+ * relying on a lambda_reference.
+ *
+ * @param[in] i Which element to return a lambda reference of.
+ * @param[in] ring Under which generalised semiring to interpret the
+ * requested \f$ i \f$th element of this vector.
+ *
+ * \note The \a ring is required to be able to interpret a sparse vector. A
+ * user who is sure this vector is dense, or otherwise is able to
+ *       ensure that a lambda_reference will only be requested at
+ *       elements where nonzeroes already exist, may refer to
+ *       Vector::operator[].
+ *
+ * @return A lambda reference to the element \a i of this vector.
+ *
+ * \par Example.
+ * See alp::eWiseLambda() for a practical and useful example.
+ *
+ * \warning There is no similar concept in the official GraphBLAS specs.
+ *
+ * @see lambda_reference For more details on the returned reference type.
+ * @see alp::eWiseLambda For one legal way in which to use the returned
+ * #lambda_reference.
+ */
+ lambda_reference operator[]( const size_t i ) {
+ (void)i;
+#ifndef _ALP_NO_EXCEPTIONS
+ assert( false ); // Requesting lambda reference of unimplemented Vector backend.
+#endif
+ }
+ }; // class Vector
+ } // namespace internal
+
+ template<
+ typename T,
+ typename Structure,
+ enum Density density,
+ typename View,
+ typename ImfR,
+ typename ImfC,
+ enum Backend backend
+ >
+ class Vector;
+
+}
+
+#endif // _H_ALP_VECTOR_BASE
diff --git a/include/alp/blas0.hpp b/include/alp/blas0.hpp
new file mode 100644
index 000000000..d69148f93
--- /dev/null
+++ b/include/alp/blas0.hpp
@@ -0,0 +1,34 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 5th of December 2016
+ */
+
+#ifndef _H_ALP_BLAS0
+#define _H_ALP_BLAS0
+
+// now include all specialisations contained in the backend directories:
+#ifdef _ALP_WITH_REFERENCE
+ #include <alp/reference/blas0.hpp>
+#endif
+#ifdef _ALP_WITH_DISPATCH
+ #include <alp/dispatch/blas0.hpp>
+#endif
+
+#endif // end ``_H_ALP_BLAS0''
diff --git a/include/alp/blas1.hpp b/include/alp/blas1.hpp
new file mode 100644
index 000000000..1d3a72b34
--- /dev/null
+++ b/include/alp/blas1.hpp
@@ -0,0 +1,34 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 29th of March 2017
+ */
+
+#ifndef _H_ALP_BLAS1
+#define _H_ALP_BLAS1
+
+#ifdef _ALP_WITH_REFERENCE
+ #include <alp/reference/blas1.hpp>
+#endif
+#ifdef _ALP_WITH_DISPATCH
+ #include <alp/dispatch/blas1.hpp>
+#endif
+
+#endif // end ``_H_ALP_BLAS1''
+
diff --git a/include/alp/blas2.hpp b/include/alp/blas2.hpp
new file mode 100644
index 000000000..740b134a9
--- /dev/null
+++ b/include/alp/blas2.hpp
@@ -0,0 +1,39 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Defines the GraphBLAS level 2 API.
+ *
+ * @author A. N. Yzelman
+ * @date: 30th of March 2017.
+ */
+
+#ifndef _H_ALP_BLAS2
+#define _H_ALP_BLAS2
+
+// now include all specialisations contained in the backend directories:
+#ifdef _ALP_WITH_REFERENCE
+ #include <alp/reference/blas2.hpp>
+#endif
+#ifdef _ALP_WITH_DISPATCH
+ #include <alp/dispatch/blas2.hpp>
+#endif
+
+#endif // end ``_H_ALP_BLAS2''
+
diff --git a/include/alp/blas3.hpp b/include/alp/blas3.hpp
new file mode 100644
index 000000000..95b6ac5c4
--- /dev/null
+++ b/include/alp/blas3.hpp
@@ -0,0 +1,37 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author: A. N. Yzelman
+ */
+
+#ifndef _H_ALP_BLAS3
+#define _H_ALP_BLAS3
+
+// now include all specialisations contained in the backend directories:
+#ifdef _ALP_WITH_REFERENCE
+ #include <alp/reference/blas3.hpp>
+#endif
+#ifdef _ALP_WITH_OMP
+ #include <alp/omp/blas3.hpp>
+#endif
+#ifdef _ALP_WITH_DISPATCH
+ #include <alp/dispatch/blas3.hpp>
+#endif
+
+#endif // end _H_ALP_BLAS3
+
diff --git a/include/alp/config.hpp b/include/alp/config.hpp
new file mode 100644
index 000000000..3913ee6da
--- /dev/null
+++ b/include/alp/config.hpp
@@ -0,0 +1,35 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 8th of August, 2016
+ */
+
+#ifndef _H_ALP_CONFIG
+#define _H_ALP_CONFIG
+
+// include all active configurations
+#ifdef _ALP_WITH_REFERENCE
+ #include "alp/reference/config.hpp"
+#endif
+#ifdef _ALP_WITH_OMP
+ #include "alp/omp/config.hpp"
+#endif
+
+#endif // end ``_H_ALP_CONFIG''
+
diff --git a/include/alp/density.hpp b/include/alp/density.hpp
new file mode 100644
index 000000000..aee7f1a18
--- /dev/null
+++ b/include/alp/density.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *
+ * @file
+ *
+ * This file registers the enum that allows a user to specify the density of a
+ * given ALP container.
+ *
+ */
+
+#ifndef _H_ALP_DENSITY
+#define _H_ALP_DENSITY
+
+
+namespace alp {
+
+ /**
+ * Specifies whether an ALP container is dense or sparse.
+ *
+ * This is specified by the user and may be used by a backend to drive
+ * a choice of a storage scheme.
+ *
+ */
+ enum Density {
+ /**
+		 * Dense containers do not allow unassigned elements.
+ *
+ * Depending on the container's \a Structure, the backend may decide to
+ * not store all the elements. For example, an upper triangular matrix
+ * can be stored without the all-zero part below the diagonal.
+ *
+ * @see Structure
+ *
+ */
+ Dense,
+ /**
+		 * Sparse containers may have unassigned (zero) elements.
+ *
+ * The backend can decide which specific format to use.
+ *
+ */
+ Sparse
+ }; // enum Density
+
+} // namespace alp
+
+#endif // _H_ALP_DENSITY
diff --git a/include/alp/descriptors.hpp b/include/alp/descriptors.hpp
new file mode 100644
index 000000000..d9a84b7f8
--- /dev/null
+++ b/include/alp/descriptors.hpp
@@ -0,0 +1,208 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Defines the GraphBLAS various descriptors.
+ *
+ * @author A. N. Yzelman
+ * @date 15 March, 2016
+ */
+
+#ifndef _H_ALP_DESCRIPTOR
+#define _H_ALP_DESCRIPTOR
+
+#include <string>
+
+
+namespace alp {
+
+ /**
+ * Descriptors indicate pre- or post-processing for some or all of the
+ * arguments to a GraphBLAS call.
+ *
+ * They can be combined using bit-wise operators. For instance, to both
+ * indicate the matrix needs be transposed and the mask needs be
+ * inverted, the following descriptor can be passed:
+ * transpose_matrix | invert_mask
+ */
+ typedef unsigned int Descriptor;
+
+ /** Collection of standard descriptors. */
+ namespace descriptors {
+
+ /**
+ * Indicates no additional pre- or post-processing on any of
+ * the GraphBLAS function arguments.
+ */
+ static constexpr Descriptor no_operation = 0;
+
+ /** Inverts the mask prior to applying it. */
+ static constexpr Descriptor invert_mask = 1;
+
+ /**
+ * Transposes the input matrix prior to applying it.
+ */
+ static constexpr Descriptor transpose_matrix = 2;
+
+ /**
+ * For data ingestion methods, such as grb::buildVector or grb::buildMatrix,
+ * this descriptor indicates that the input shall not contain any duplicate
+ * entries.
+ *
+ * Use of this descriptor will speed up the corresponding function call
+ * significantly.
+ *
+ * A call to buildMatrix with this descriptor set will pass its arguments to
+ * buildMatrixUnique.
+ *
+ * \warning Use of this descriptor while the data to be ingested actually
+ * \em does contain duplicates will lead to undefined behaviour.
+ *
+ * Currently, the reference implementation only supports ingesting data
+ * using this descriptor. Support for duplicate input is not yet
+ * implemented everywhere.
+ */
+ static constexpr Descriptor no_duplicates = 4;
+
+ /**
+ * Uses the structure of a mask vector only.
+ *
+ * This ignores the actual values of the mask argument. The i-th element of
+ * the mask now evaluates true if the mask has \em any value assigned to its
+ * i-th index, regardless of how that value evaluates. It evaluates false
+ * if there was no value assigned.
+ *
+ * @see structural_complement
+ */
+ static constexpr Descriptor structural = 8;
+
+ /**
+ * Uses the structural complement of a mask vector.
+ *
+ * This is a convenience short-hand for:
+ * \code
+ * constexpr Descriptor structural_complement = structural | invert_mask;
+ * \endcode
+ *
+ * This ignores the actual values of the mask argument. The i-th element of
+ * the mask now evaluates true if the mask has \em no value assigned to its
+ * i-th index, and evaluates false otherwise.
+ */
+ static constexpr Descriptor structural_complement = structural | invert_mask;
+
+ /**
+ * Indicates all vectors used in a computation are dense. This is a hint that
+ * might affect performance but will never affect the semantics of the
+ * computation.
+ */
+ static constexpr Descriptor dense = 16;
+
+ /**
+ * For any call to a matrix computation, the input matrix \a A is instead
+ * interpreted as \f$ A+I \f$, with \a I the identity matrix of dimension
+ * matching \a A. If \a A is not square, padding zero columns or rows will
+ * be added to \a I in the largest dimension.
+ */
+ static constexpr Descriptor add_identity = 32;
+
+ /**
+ * Instead of using input vector elements, use the index of those elements.
+ *
+		 * Indices are cast from their internal data type (e.g., \a size_t)
+ * to the relevant domain of the operator used.
+ */
+ static constexpr Descriptor use_index = 64;
+
+ /**
+ * Disallows the standard casting of input parameters to a compatible domain
+ * in case they did not match exactly.
+ *
+ * Setting this descriptor will yield compile-time errors whenever casting
+ * would have been necessary to successfully compile the requested graphBLAS
+ * operation.
+ *
+ * \warning It is illegal to perform conditional toggling on this descriptor.
+ *
+ * \note With conditional toggling, if descr is a descriptor, we
+ * mean if( descr & descriptors::no_casting ) {
+ * new_descr = desc - descriptors::no_casting
+ * //followed by any use of this new descriptor
+ * }
+ *
+ * The reason we cannot allow for this type of toggling is because this
+ * descriptor makes use of the static_assert C++11 function,
+ * which is checked regardless of the result of the if-statement. Thus
+ * the above code actually always throws compile errors on mismatching
+ * domains, no matter the original value in descr .
+ *
+ * \internal Simply making this descriptor the one with the largest integral
+ * value amongst the various descriptors is enough to guarantee
+ * nothing bad will happen. A notable exception are underflows,
+ * which are caught by using internal::MAX_DESCRIPTOR_VALUE.
+ */
+ static constexpr Descriptor no_casting = 256;
+
+ /**
+ * Computation shall proceed with zeros (according to the current semiring)
+ * propagating throughout the requested computation.
+ *
+ * \warning This may lead to unexpected results if the same output container
+ * is interpreted under a different semiring-- what is zero for the current
+ * semiring may not be zero for another. In other words: the concept of
+ * sparsity will no longer generalise to other semirings.
+ */
+ static constexpr Descriptor explicit_zero = 512;
+
+ /**
+ * Indicates overlapping input and output vectors is intentional and safe, due
+ * to, for example, the use of masks.
+ */
+ static constexpr Descriptor safe_overlap = 1024;
+
+ /**
+ * For operations involving 2 matrices, transposes the left-hand side input
+ * matrix prior to applying it.
+ */
+ static constexpr Descriptor transpose_left = 2048;
+
+ /**
+ * For operations involving 2 matrices, transposes the right-hand side input
+ * matrix prior to applying it.
+ */
+ static constexpr Descriptor transpose_right = 4096;
+
+ /**
+ * Translates a descriptor into a string.
+ *
+ * @param[in] descr The input descriptor.
+ *
+ * @returns A detailed English description.
+ */
+ std::string toString( const Descriptor descr );
+
+ } // namespace descriptors
+
+ namespace internal {
+ /** A descriptor cannot have a higher value than the below. */
+ static constexpr Descriptor MAX_DESCRIPTOR_VALUE = 8191;
+ } // namespace internal
+
+} // namespace alp
+
+#endif
diff --git a/include/alp/dispatch/blas0.hpp b/include/alp/dispatch/blas0.hpp
new file mode 100644
index 000000000..0a6c41309
--- /dev/null
+++ b/include/alp/dispatch/blas0.hpp
@@ -0,0 +1,80 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 14th of January 2022
+ */
+
+#ifndef _H_ALP_DISPATCH_BLAS0
+#define _H_ALP_DISPATCH_BLAS0
+
+#include <type_traits> // std::enable_if, std::is_same
+
+#include
+#include
+#include
+#include