'Optimize network range function

I found a blog post by Richard Paquin Morel on computing a network range function in R (Burt 1981; Reagans and McEvily 2003). The function assigns a value to each network node based on the number of its contacts and the interconnectedness of these nodes. The network range can be computed accross subgroups of nodes (e.g. female and male nodes). These are stored as attributes of the vertices.

The author's example is very illustrative, but it is based on a relatively small network (about 100 nodes). I have a network with about 200,000 nodes, which means the performance of the function is not suited for my analyses.

I present you an example where the size of a random graph according to the Erdos-Renyi model can be manipulated to time the performance of the functions.

I am unfamiliar with optimizing R code, but I think the adjacency matrix needs to be stored more efficiently (e.g., a sparse matrix). My attempts so far did not succeed in producing a working function.

rm(list=ls())

library(igraph)    
library(statnet)
library(intergraph)
library(tictoc)

set.seed(42)



## Source: https://ramorel.github.io/network-range/
## Function to find network range for each node in a network
## Arguments:
##  net = adjacency matrix, igraph graph, or network object
##  attr = Vector of attributes associated with each node in net
##  directed = boolean indicated if the network is directed or not


netrange <- function(net, attr, directed = TRUE){
  require(reshape2)
  if (class(net) == "igraph") {
    net <- as_adjacency_matrix(net, sparse = F)
  }
  else {
    if(class(net) == "network") {
      net <- as.matrix.network(net)
    }
    else {
      net <- as.matrix(net)
    }
  }
  if(nrow(net) != length(attr)) {
    stop("Number of nodes must match length of attributes vector")
  }
  else {
    if (directed == TRUE){
      ns <- colnames(net)
      el <- melt(net, varnames=c("ego", "alter"), value.name = "weight")
      df <- cbind(rownames(net), attr)
      el$ego_grp <- df[match(el[,1], df[,1]), 2]
      el$alter_grp <- df[match(el[,2], df[,1]), 2]
      
      #FINDING p_k, the strength of ties within each group
      # z_iq = sum of strength of ties from nodes in group _k_ to all other alters
      # z_ij = sum of strength of ties from nodes in group _k_ to alters in group _k_
      
      z_iq <- sapply(unique(attr), function(x) {
        sum(el[which(el$ego_grp==x), "weight"])
      })
      z_ij <- sapply(unique(attr), function(x) {
        sum(el[which(el$ego_grp==x & el$alter_grp==x), "weight"])
      })
      p_k <- z_ij / z_iq
      p_k[is.na(p_k)] <- 0
      
      #FINDING p_ik, the strength of connection from person i to group k
      # x_iq = sum of strength of ties for _i_ to alters in group _k_
      # x_ij = sum of strength of ties for _i_ to all alters
      
      x_ij <- sapply(colnames(net), function(x) {
        sum(el[which(el$ego==x), "weight"])
      }
      )
      x_iq <- list(NULL)
      for(i in colnames(net)) {
        x_iq[[i]] <- sapply(unique(attr), function(x) {
          sum(el[which(el$ego==i & el$alter_grp==x), "weight"])
        }
        )
      }
      x_iq <- x_iq[-c(1)] #x_iq is now a list where each elements is a vector of node _i_ summed strength of tie to group _k_
      
      p_ik <- lapply(1:length(x_iq), 
                     function(x) x_iq[[x]] / x_ij[x])
      
      # FINDING nd_i, the network diversity score for node _i_
      
      nd_i <- sapply(1:length(p_ik), 
                     function(x) 1 - sum(p_k*p_ik[[x]]^2, na.rm = F)
      )
    }
    else {
      ns <- colnames(net)
      el <- melt(net, varnames=c("ego", "alter"), value.name = "weight")
      dup <- data.frame(t(apply(el[,1:2],1,sort)))
      
      el <- el[!duplicated(dup),]
      df <- cbind(rownames(net), attr)
      el$ego_grp <- df[match(el[,1], df[,1]), 2]
      el$alter_grp <- df[match(el[,2], df[,1]), 2]
      
      #FINDING p_k, the strength of ties within each group
      # z_iq = sum of strength of ties from nodes in group _k_ to all other alters
      # z_ij = sum of strength of ties from nodes in group _k_ to alters in group _k_
      
      z_iq <- sapply(unique(attr), function(x) {
        sum(el[which(el$ego_grp==x | el$alter_grp==x), "weight"])
      })
      z_ij <- sapply(unique(attr), function(x) {
        sum(el[which(el$ego_grp==x & el$alter_grp==x), "weight"])
      })
      p_k <- z_ij / z_iq
      p_k[is.na(p_k)] <- 0
      
      #FINDING p_ik, the strength of connection from person i to group k
      # x_iq = sum of strength of ties for _i_ to alters in group _k_
      # x_ij = sum of strength of ties for _i_ to all alters
      
      x_ij <- sapply(colnames(net), function(x) {
        sum(el[which(el$ego==x | el$alter==x), "weight"])
      }
      )
      x_iq <- list(NULL)
      for(i in colnames(net)) {
        x_iq[[i]] <- sapply(unique(attr), function(x) {
          sum(el[which(el$ego==i & el$alter_grp==x), "weight"],
              el[which(el$alter==i & el$ego_grp==x), "weight"])
        }
        )
      }
      x_iq <- x_iq[-c(1)] #x_iq is now a list where each elements is a vector of node _i_ summed strength of tie to group _k_
      
      p_ik <- lapply(1:length(x_iq), 
                     function(x) x_iq[[x]] / x_ij[x])
      
      
      # FINDING nd_i, the network diversity score for node _i_
      
      nd_i <- sapply(1:length(p_ik), 
                     function(x) 1 - sum(p_k*p_ik[[x]]^2, na.rm = F)
      )
    }
    return(nd_i)
  }
}


# Generate exemplary network

g <- igraph::erdos.renyi.game(1000, 150, type = "gnm")

## Add categorical (binary) vertex feature: female

V(g)$female <- sample(c(0,1), replace=TRUE, size=length(V(g)))
V(g)$female


## transform igraph to statnet 

net <- intergraph::asNetwork(g)


## Apply network function 

tic()
range_female <- netrange(net, 
                         net %v% "female",
                         directed = T)
seq_time <- toc()


Solution 1:[1]

Best thing to optimize is to use profiling using profvis which shows where your bottle necks are. I took part of your function though.

There we saw x_ij <- sapply(colnames(net), function(x) { sum(el[which(el$ego==x | el$alter==x), "weight"]) }) taking terribly long.

I prefer data.table myself so just provide the code here to speed up that part which takes a few seconds only. Try the same then for creating x_iq.

xx_ij <- el[, .(xxij = lapply(.SD, sum)), by = c("ego"), .SDcols = c("weight")]
xx_ij2 <- xx_ij$xxij
names(xx_ij2) <- xx_ij$ego

identical(x_ij, unlist(xx_ij2))
# TRUE

enter image description here

**profiling

library(profvis)
p <- profvis({
  ## code here
})

f <- paste0("profile_", as.Date(now()), ".html")
htmlwidgets::saveWidget(p, f)
browseURL(f)

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1