Assemble list of sparse matrices with variable columns

I have a (named) list of sparse matrices. They have the same rows, and their columns come from the same set. However, any sparse matrix may have a subset of the possible columns. For example:

library(Matrix)

# Fix the RNG state so the example matrices below are reproducible.
set.seed(2)

# Pool of possible column names, the shared row names, and one label per
# matrix in the list.
my_colnames <- head(LETTERS, 5)
my_rownames <- head(letters, 3)
my_mat_names <- head(month.name, 3)

# Build one random sparse matrix for the example.
#
# The rows are always `my_rownames`; the columns are a random, alphabetically
# sorted subset of `my_colnames`. Cell values are Binomial(5, 0.2) draws and
# the result is coerced to class "dgCMatrix".
generate_mat <- function() {
  # Draw the number of columns first, then which columns to keep.
  n_cols <- rbinom(1, 5, .7)
  cols_here <- sort(sample(my_colnames, n_cols))

  n_rows <- length(my_rownames)
  cell_values <- rbinom(length(cols_here) * n_rows, 5, .2)

  sparse <- Matrix(cell_values,
                   nrow = n_rows,
                   ncol = length(cols_here),
                   dimnames = list(my_rownames, cols_here))
  as(sparse, "dgCMatrix")
}

# Draw one random matrix per entry of `my_mat_names` and label it accordingly.
list_of_mat <- setNames(
  replicate(length(my_mat_names), generate_mat()),
  my_mat_names
)

list_of_mat
#> $January
#> 3 x 4 sparse Matrix of class "dgCMatrix"
#>   A B D E
#> a 1 . 3 .
#> b . 1 . 1
#> c 2 2 1 1
#> 
#> $February
#> 3 x 3 sparse Matrix of class "dgCMatrix"
#>   A B E
#> a 3 . 1
#> b . 2 1
#> c . 2 2
#> 
#> $March
#> 3 x 4 sparse Matrix of class "dgCMatrix"
#>   A B C E
#> a 3 1 . 2
#> b 2 1 . .
#> c 3 2 1 2

I want to combine the individual matrices in a way that I can use them, e.g. as a (sparse) array, or a long data frame:

library(purrr)

#' Combine a named list of matrices into one long data frame.
#'
#' One output row is produced for every combination of matrix name, row name
#' and column name, where row and column names are the union over all
#' matrices. A cell that a given matrix does not contain is reported as 0.
#'
#' @param list_of_mat Named list of matrices that carry row and column names.
#'   Any matrix class supporting `m[cbind(i, j)]` indexing should work
#'   (base matrices do; Matrix sparse classes are expected to -- confirm for
#'   the classes you use).
#' @return The data frame built by `expand.grid()` with columns `mat_name`,
#'   `row_name`, `col_name` and a double column `value`; `mat_name` varies
#'   fastest.
list_of_mat_to_long_df <- function(list_of_mat) {
  all_rownames <- Reduce(union, lapply(list_of_mat, rownames))
  all_colnames <- Reduce(union, lapply(list_of_mat, colnames))

  expanded <- expand.grid(mat_name = names(list_of_mat),
                          row_name = all_rownames,
                          col_name = all_colnames)

  # Default for every cell a matrix does not have.
  values <- numeric(nrow(expanded))
  grid_mat <- as.character(expanded$mat_name)

  for (nm in unique(grid_mat)) {
    mat <- list_of_mat[[nm]]
    rows_here <- which(grid_mat == nm)

    # Resolve names to positions explicitly. `match()` compares factors by
    # label, so the lookup never depends on factor integer codes, and a
    # missing name yields NA instead of an error.
    i <- match(expanded$row_name[rows_here], rownames(mat))
    j <- match(expanded$col_name[rows_here], colnames(mat))
    present <- !is.na(i) & !is.na(j)

    if (any(present)) {
      # One vectorised extraction per matrix instead of one per cell.
      values[rows_here[present]] <-
        as.numeric(mat[cbind(i[present], j[present])])
    }
  }

  expanded$value <- values
  expanded
}

list_of_mat_to_long_df(list_of_mat)
#>    mat_name row_name col_name value
#> 1   January        a        A     1
#> 2  February        a        A     3
#> 3     March        a        A     3
#> 4   January        b        A     0
#> 5  February        b        A     0
#> 6     March        b        A     2
#> 7   January        c        A     2
#> 8  February        c        A     0
#> 9     March        c        A     3
#> 10  January        a        B     0
#> 11 February        a        B     0
#> 12    March        a        B     1
#> 13  January        b        B     1
#> 14 February        b        B     2
#> 15    March        b        B     1
#> 16  January        c        B     2
#> 17 February        c        B     2
#> 18    March        c        B     2
#> 19  January        a        D     3
#> 20 February        a        D     0
#> 21    March        a        D     0
#> 22  January        b        D     0
#> 23 February        b        D     0
#> 24    March        b        D     0
#> 25  January        c        D     1
#> 26 February        c        D     0
#> 27    March        c        D     0
#> 28  January        a        E     0
#> 29 February        a        E     1
#> 30    March        a        E     2
#> 31  January        b        E     1
#> 32 February        b        E     1
#> 33    March        b        E     0
#> 34  January        c        E     1
#> 35 February        c        E     2
#> 36    March        c        E     2
#> 37  January        a        C     0
#> 38 February        a        C     0
#> 39    March        a        C     0
#> 40  January        b        C     0
#> 41 February        b        C     0
#> 42    March        b        C     0
#> 43  January        c        C     0
#> 44 February        c        C     0
#> 45    March        c        C     1

In practice, these matrices are big (100k rows/columns), so the proposed solution is too slow. In addition, they cannot be cast to dense matrices. Is there an efficient way to do this?

Note: this topic has a way to resize sparse matrices using the column index, which I cannot use directly because I have to rely on column names.



Solution 1:[1]

I guess you have a couple of options. The first is to cbind the sparse matrices and construct a factor specifying the origin of each column in the resulting (wider) sparse matrix:

# Bind all matrices side by side into one wider sparse matrix, then print it.
xx <- do.call(cbind, list_of_mat)
xx
## 3 x 11 sparse Matrix of class "dgCMatrix"
##    [[ suppressing 11 column names 'A', 'B', 'D' ... ]]
##                        
## a 1 . 3 . 3 . 1 3 1 . 2
## b . 1 . 1 . 2 1 2 1 . .
## c 2 2 1 1 . 2 2 3 2 1 2

# Factor with one entry per column of the bound matrix, naming the list
# element that column came from.
matname <- rep.int(
  gl(length(list_of_mat), 1L, labels = names(list_of_mat)),
  times = vapply(list_of_mat, ncol, integer(1))
)
matname
##  [1] January  January  January  January  February February February March   
##  [9] March    March    March   
## Levels: January February March

The second is to construct a long data frame like the one you've shown but excluding rows with value == 0 to conserve memory. Matrix has mat2triplet for this purpose:

# Turn each sparse matrix into a long data frame of its stored entries,
# replacing the integer indices i and j by the matching row and column names.
dd <- lapply(list_of_mat, function(M) {
  triplets <- as.data.frame(mat2triplet(M))
  labelled <- transform(triplets,
                        rowname = rownames(M)[i],
                        colname = colnames(M)[j])
  subset(labelled, select = -c(i, j))
})
dd
## $January
##   x rowname colname
## 1 1       a       A
## 2 2       c       A
## 3 1       b       B
## 4 2       c       B
## 5 3       a       D
## 6 1       c       D
## 7 1       b       E
## 8 1       c       E
## 
## $February
##   x rowname colname
## 1 3       a       A
## 2 2       b       B
## 3 2       c       B
## 4 1       a       E
## 5 1       b       E
## 6 2       c       E
## 
## $March
##   x rowname colname
## 1 3       a       A
## 2 2       b       A
## 3 3       c       A
## 4 1       a       B
## 5 1       b       B
## 6 2       c       B
## 7 1       c       C
## 8 2       a       E
## 9 2       c       E
## 

# Stack the per-matrix data frames and add a factor column recording which
# list element each row came from.
tt <- transform(
  do.call(rbind, unname(dd)),
  matname = rep.int(
    gl(length(dd), 1L, labels = names(dd)),
    times = vapply(dd, nrow, integer(1))
  )
)
tt
##    x rowname colname  matname
## 1  1       a       A  January
## 2  2       c       A  January
## 3  1       b       B  January
## 4  2       c       B  January
## 5  3       a       D  January
## 6  1       c       D  January
## 7  1       b       E  January
## 8  1       c       E  January
## 9  3       a       A February
## 10 2       b       B February
## 11 2       c       B February
## 12 1       a       E February
## 13 1       b       E February
## 14 2       c       E February
## 15 3       a       A    March
## 16 2       b       A    March
## 17 3       c       A    March
## 18 1       a       B    March
## 19 1       b       B    March
## 20 2       c       B    March
## 21 1       c       C    March
## 22 2       a       E    March
## 23 2       c       E    March

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1