natverse · jefferis · Aug 17, 2025 · Nov 29, 2023 · Nov 29, 2023 · Jul 25, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -13,12 +13,12 @@ Description: Provides data source agnostic utility functions to support
 License: GPL (>= 3)
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Imports: 
     bit64,
     checkmate,
     glue,
-    Matrix,
+    Matrix (>= 1.5-3),
     grDevices,
     stats
 Suggests: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,12 +1,15 @@
 # Generated by roxygen2: do not edit by hand
 
 export(add_cluster_info)
+export(colScaleM)
 export(cosine_sim)
 export(dataset_names)
+export(geomScaleM)
 export(id2char)
 export(partner_summary2adjacency_matrix)
 export(prepare_cosine_matrix)
 export(register_dataset)
+export(rowScaleM)
 importFrom(grDevices,hcl.colors)
 importFrom(stats,as.dist)
 importFrom(stats,hclust)

diff --git a/R/sparsemat.R b/R/sparsemat.R
@@ -0,0 +1,79 @@
+#' Efficient column and row scaling for sparse matrices
+#'
+#' @details Conventionally in connectivity matrices, rows are upstream input
+#'   neurons and columns are downstream output neurons. \code{colScaleM} will
+#'   therefore normalise by the total input onto each downstream neuron while
+#'   \code{rowScaleM} will normalise by the total output from each upstream
+#'   neuron.
+#'
+#'   These functions will work with dense inputs but are less efficient than
+#'   their base R cousins (e.g. \code{scale}). As an example, normalising a 4x4k
+#'   subset of VNC connectivity data took \itemize{
+#'
+#'   \item \code{colScaleM} sparse 3.5ms
+#'
+#'   \item \code{colScaleM} dense 1460ms
+#'
+#'   \item \code{scale} dense 365ms
+#'
+#'   }
+#'
+#'   In conclusion sparse matrices with \code{colScaleM}/\code{rowScaleM} are
+#'   the way to go for real connectivity matrices! For large matrices they will
+#'   be \emph{much} faster and for small ones, both methods will be fast anyway.
+#'
+#'   Note also that these functions were originally called \code{colScale} etc
+#'   but then the \code{\link[Matrix]{Matrix}} package added functions of the
+#'   same name. The functions in this package differ in two respects from the
+#'   \code{Matrix::\link[Matrix:colScale]{colScale,rowScale}} functions. First
+#'   they calculate the required row or column sums to perform the scaling.
+#'   Second, they handle the inevitable zero sum rows or columns via the default
+#'   \code{na.rm} argument.
+#'
+#' @param A a sparse (or dense) matrix
+#' @param na.rm when \code{T}, the default, converts any NA values (usually
+#'   resulting from divide by zero errors when a neuron has no partners) to 0
+#' @description \code{colScale} normalises a matrix by the sum of each column
+#'
+#' @return A scaled sparse matrix
+#' @export
+#'
+#' @seealso \code{Matrix::\link[Matrix]{colScale}} on which these are based and,
+#'   for the original version,
+#'   \url{https://stackoverflow.com/questions/39284774/column-rescaling-for-a-very-large-sparse-matrix-in-r}
+#'
+#' @examples
+#' library(Matrix)
+#' set.seed(42)
+#' A <- Matrix(rbinom(100,10,0.05), nrow = 10)
+#' rownames(A)=letters[1:10]
+#' colnames(A)=LETTERS[1:10]
+#'
+#' colScaleM(A)
+#' rowScaleM(A)
+#' geomScaleM(A)
+colScaleM <- function(A, na.rm=TRUE) {
+  scalefac = 1 / Matrix::colSums(A)
+  if(na.rm) scalefac[!is.finite(scalefac)]=0
+  Matrix::colScale(A, scalefac)
+}
+
+#' @export
+#' @rdname colScaleM
+#' @description \code{rowScale} normalises a matrix by the sum of each row
+rowScaleM <- function(A, na.rm=TRUE) {
+  scalefac = 1 / Matrix::rowSums(A)
+  if(na.rm) scalefac[!is.finite(scalefac)]=0
+  Matrix::rowScale(A, scalefac)
+}
+
+#' @export
+#' @rdname colScaleM
+#' @description sqrt of both column and row normalisation
+geomScaleM <- function(A, na.rm=TRUE) {
+  sf1=sqrt(1/Matrix::rowSums(A))
+  if(na.rm) sf1[!is.finite(sf1)]=0
+  sf2=sqrt(1/Matrix::colSums(A))
+  if(na.rm) sf2[!is.finite(sf2)]=0
+  Matrix::dimScale(A, d1 = sf1, d2 = sf2)
+}
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -5,8 +5,10 @@ Lifecycle
 Natverse
 ORCID
 PNs
+VNC
 connectome
 connectomics
+etc
 fafbseg
 flywire
 natverse

diff --git a/man/colScaleM.Rd b/man/colScaleM.Rd
diff --git a/tests/testthat/test-sparsemat.R b/tests/testthat/test-sparsemat.R
@@ -0,0 +1,37 @@
+test_that("multiplication works", {
+  # library(Matrix)
+  # set.seed(42)
+  # A <- Matrix(rbinom(100,10,0.05), nrow = 10)
+  # rownames(A)=letters[1:10]
+  # colnames(A)=LETTERS[1:10]
+  A=new("dgCMatrix", i = c(0L, 1L, 3L, 4L, 6L, 8L, 9L, 1L, 2L, 5L,
+6L, 0L, 2L, 3L, 7L, 9L, 0L, 1L, 3L, 5L, 8L, 9L, 3L, 5L, 6L, 7L,
+8L, 9L, 3L, 5L, 6L, 0L, 1L, 2L, 4L, 7L, 8L, 5L, 3L, 4L, 0L, 3L,
+4L, 5L, 8L, 9L), p = c(0L, 7L, 11L, 16L, 22L, 28L, 31L, 37L,
+38L, 40L, 46L), Dim = c(10L, 10L), Dimnames = list(c("a", "b",
+"c", "d", "e", "f", "g", "h", "i", "j"), c("A", "B", "C", "D",
+"E", "F", "G", "H", "I", "J")), x = c(2, 2, 1, 1, 1, 1, 1, 1,
+2, 2, 2, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
+1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1), factors = list())
+  colScaleM(A)
+  csA=new("dgCMatrix", i = c(0L, 1L, 3L, 4L, 6L, 8L, 9L, 1L, 2L, 5L,
+6L, 0L, 2L, 3L, 7L, 9L, 0L, 1L, 3L, 5L, 8L, 9L, 3L, 5L, 6L, 7L,
+8L, 9L, 3L, 5L, 6L, 0L, 1L, 2L, 4L, 7L, 8L, 5L, 3L, 4L, 0L, 3L,
+4L, 5L, 8L, 9L), p = c(0L, 7L, 11L, 16L, 22L, 28L, 31L, 37L,
+38L, 40L, 46L), Dim = c(10L, 10L), Dimnames = list(c("a", "b",
+"c", "d", "e", "f", "g", "h", "i", "j"), c("A", "B", "C", "D",
+"E", "F", "G", "H", "I", "J")), x = c(0.222222222222222, 0.222222222222222,
+0.111111111111111, 0.111111111111111, 0.111111111111111, 0.111111111111111,
+0.111111111111111, 0.142857142857143, 0.285714285714286, 0.285714285714286,
+0.285714285714286, 0.125, 0.375, 0.25, 0.125, 0.125, 0.166666666666667,
+0.166666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667,
+0.166666666666667, 0.222222222222222, 0.222222222222222, 0.111111111111111,
+0.111111111111111, 0.222222222222222, 0.111111111111111, 0.333333333333333,
+0.333333333333333, 0.333333333333333, 0.142857142857143, 0.285714285714286,
+0.142857142857143, 0.142857142857143, 0.142857142857143, 0.142857142857143,
+1, 0.5, 0.5, 0.125, 0.25, 0.25, 0.125, 0.125, 0.125), factors = list())
+  expect_equal(colScaleM(A), csA)
+
+  expect_equal(geomScaleM(A),
+    sqrt(colScaleM(A)*rowScaleM(A)))
+})