How to unnest and reorganize a complex nested list to run kmeans

I have a list containing multiple lists of dataframes. In the Rviewer, here is an example of what you see:

image1

Note: data frames B-F have the same variable names.

My questions are:

  1. Is there a simple way to reshape this information into a single table, so that:

the A:F will become character values under a new variable (e.g., "alphabet") and all of the nested variables will be combined so there aren't any duplicated variable names? For instance, List 1 would be broken up into: (please click link for img ->)

image2

note: all of the variables would be filled, I just left it blank here.

I'm trying to do this to run kmeans specifically on three variables, bm1, bm2, and ls which are in the sample code below.

  2. And after doing this, is there a simple way to revert it back to its original structure with some additional variables (e.g., clusters)?

Here is the dput(data) for the example code:

list(A = structure(list(r = c(0, 0, 0, 0, 0, 0), x = c(4300, 
4800, 5300, 4300, 4800, 5300), y = c(4400, 4400, 4400, 4800, 
4800, 4800), fm1 = c(3800, 4400, 5000, 3600, 4200, 5200), fm2 = 
c(3900, 
4600, 5300, 3900, 4400, 5600), bm1 = c(400, 400, 400, 400, 400, 
400), bm2 = c(300, 300, 400, 300, 300, 400), ns = c(3600, 4200, 
4900, 3600, 4100, 5200), sn = c(0, 0, 0, 0, 0, 0), ls = c(0, 
0, 0, 0, 0, 0), fa = c(0, 0, 0, 0, 0, 0), sln = c(0, 0, 0, 0, 
0, 0)), row.names = c(NA, 6L), class = "data.frame"), B = 
structure(list(
r = c(0, 0, 0, 0, 0, 0), x = c(4300, 4800, 5300, 4300, 4800, 
5300), y = c(4500, 4500, 4500, 4900, 4900, 4900), fm1 = c(1300, 
1400, 1500, 1100, 1200, 1200), fm2 = c(1400, 1500, 1500, 
1200, 1300, 1300), bm1 = c(100, 100, 100, 100, 100, 100), 
bm2 = c(100, 100, 100, 100, 100, 100), ns = c(1200, 1400, 
1400, 1100, 1100, 1200), sn = c(0, 0, 100, 100, 0, 100), 
ls = c(0, 0, 0, 0, 0, 0), fa = c(0, 0, 0, 0, 0, 0), sln = c(0, 
0, 0, 0, 0, 0)), row.names = c(NA, 6L), class = "data.frame"), 
C = structure(list(r = c(0, 0, 0, 0, 0, 0), x = c(4300, 4800, 
5300, 4300, 4800, 5300), y = c(4400, 4400, 4400, 4800, 4800, 
4800), fm1 = c(4100, 4400, 4600, 3700, 4100, 3900), fm2 = c(4400, 
4600, 4900, 4000, 4400, 4300), bm1 = c(200, 200, 200, 200, 
200, 200), bm2 = c(200, 200, 200, 200, 200, 200), ns = c(4200, 
4500, 4700, 3800, 4200, 4100), sn = c(0, 100, 100, 0, 0, 
200), ls = c(0, 0, 0, 0, 0, 0), fa = c(0, 0, 0, 0, 0, 0), 
sln = c(0, 0, 0, 0, 0, 0)), row.names = c(NA, 6L), class = 
"data.frame"), 
D = structure(list(r = c(0, 0, 0, 0, 0, 0), x = c(4400, 4900, 
5400, 4400, 4900, 5400), y = c(4500, 4500, 4500, 4900, 4900, 
4900), fm1 = c(3000, 3200, 3300, 2500, 2600, 2600), fm2 = c(3400, 
3600, 3600, 2700, 2900, 2900), bm1 = c(300, 300, 300, 300, 
300, 200), bm2 = c(300, 200, 200, 200, 200, 200), ns = c(3100, 
3400, 3400, 2500, 2700, 2700), sn = c(0, 0, 0, 0, 0, 0), 
ls = c(0, 0, 0, 0, 0, 0), fa = c(0, 0, 0, 0, 0, 0), sln = c(0, 
0, 0, 0, 0, 0)), row.names = c(NA, 6L), class = "data.frame"), 
E = structure(list(r = c(0, 0, 0, 0, 0, 0), x = c(4400, 4900, 
5400, 4400, 4900, 5400), y = c(4500, 4500, 4500, 4900, 4900, 
4900), fm1 = c(2500, 2300, 2400, 2700, 2400, 2300), fm2 = c(2600, 
2400, 2600, 2900, 2600, 2500), bm1 = c(200, 200, 200, 200, 
200, 200), bm2 = c(200, 200, 200, 200, 200, 200), ns = c(2400, 
2200, 2400, 2700, 2400, 2300), sn = c(0, 100, 100, 0, 100, 
100), ls = c(0, 0, 0, 0, 0, 0), fa = c(0, 0, 0, 0, 0, 0), 
sln = c(0, 0, 0, 0, 0, 0)), row.names = c(NA, 6L), class = 
"data.frame"), 
F = structure(list(r = c(0, 0, 0, 0, 0, 0), x = c(4300, 4800, 
5300, 4300, 4800, 5300), y = c(4400, 4400, 4400, 4800, 4800, 
4800), fm1 = c(3300, 3500, 3400, 2700, 3100, 3100), fm2 = c(3500, 
3700, 3700, 2900, 3300, 3400), bm1 = c(200, 200, 200, 200, 
200, 200), bm2 = c(200, 200, 200, 200, 200, 200), ns = c(3300, 
3600, 3500, 2700, 3100, 3200), sn = c(0, 100, 100, 0, 0, 
0), ls = c(0, 0, 0, 0, 0, 0), fa = c(0, 0, 0, 0, 0, 0), sln = c(0, 
0, 0, 0, 0, 0)), row.names = c(NA, 6L), class = "data.frame"))


Solution 1:[1]

Is it as simple as this?

library(dplyr)
# Cluster each data frame in the list on its own and store the assignment in
# a new `cluster` column. lapply() over the named list keeps the element
# names, so the result has the same list-of-data-frames structure as before.
# Note: kmeans() chooses random starting centres, so cluster labels can differ
# between runs; call set.seed() beforehand for reproducible output.
dat <- lapply(dat, function(df) {
  # Restrict the clustering input to the numeric columns.
  numeric_cols <- select(df, where(is.numeric))
  # Fit two clusters; `$cluster` holds one integer label per row.
  fit <- kmeans(numeric_cols, centers = 2)
  df$cluster <- fit$cluster
  df
})
dat
#> $A
#>   r    x    y  fm1  fm2 bm1 bm2   ns sn ls fa sln cluster
#> 1 0 4300 4400 3800 3900 400 300 3600  0  0  0   0       1
#> 2 0 4800 4400 4400 4600 400 300 4200  0  0  0   0       1
#> 3 0 5300 4400 5000 5300 400 400 4900  0  0  0   0       2
#> 4 0 4300 4800 3600 3900 400 300 3600  0  0  0   0       1
#> 5 0 4800 4800 4200 4400 400 300 4100  0  0  0   0       1
#> 6 0 5300 4800 5200 5600 400 400 5200  0  0  0   0       2
#> 
#> $B
#>   r    x    y  fm1  fm2 bm1 bm2   ns  sn ls fa sln cluster
#> 1 0 4300 4500 1300 1400 100 100 1200   0  0  0   0       1
#> 2 0 4800 4500 1400 1500 100 100 1400   0  0  0   0       2
#> 3 0 5300 4500 1500 1500 100 100 1400 100  0  0   0       2
#> 4 0 4300 4900 1100 1200 100 100 1100 100  0  0   0       1
#> 5 0 4800 4900 1200 1300 100 100 1100   0  0  0   0       1
#> 6 0 5300 4900 1200 1300 100 100 1200 100  0  0   0       2
#> 
#> $C
#>   r    x    y  fm1  fm2 bm1 bm2   ns  sn ls fa sln cluster
#> 1 0 4300 4400 4100 4400 200 200 4200   0  0  0   0       2
#> 2 0 4800 4400 4400 4600 200 200 4500 100  0  0   0       1
#> 3 0 5300 4400 4600 4900 200 200 4700 100  0  0   0       1
#> 4 0 4300 4800 3700 4000 200 200 3800   0  0  0   0       2
#> 5 0 4800 4800 4100 4400 200 200 4200   0  0  0   0       2
#> 6 0 5300 4800 3900 4300 200 200 4100 200  0  0   0       2
#> 
#> $D
#>   r    x    y  fm1  fm2 bm1 bm2   ns sn ls fa sln cluster
#> 1 0 4400 4500 3000 3400 300 300 3100  0  0  0   0       1
#> 2 0 4900 4500 3200 3600 300 200 3400  0  0  0   0       1
#> 3 0 5400 4500 3300 3600 300 200 3400  0  0  0   0       1
#> 4 0 4400 4900 2500 2700 300 200 2500  0  0  0   0       2
#> 5 0 4900 4900 2600 2900 300 200 2700  0  0  0   0       2
#> 6 0 5400 4900 2600 2900 200 200 2700  0  0  0   0       2
#> 
#> $E
#>   r    x    y  fm1  fm2 bm1 bm2   ns  sn ls fa sln cluster
#> 1 0 4400 4500 2500 2600 200 200 2400   0  0  0   0       1
#> 2 0 4900 4500 2300 2400 200 200 2200 100  0  0   0       2
#> 3 0 5400 4500 2400 2600 200 200 2400 100  0  0   0       2
#> 4 0 4400 4900 2700 2900 200 200 2700   0  0  0   0       1
#> 5 0 4900 4900 2400 2600 200 200 2400 100  0  0   0       2
#> 6 0 5400 4900 2300 2500 200 200 2300 100  0  0   0       2
#> 
#> $F
#>   r    x    y  fm1  fm2 bm1 bm2   ns  sn ls fa sln cluster
#> 1 0 4300 4400 3300 3500 200 200 3300   0  0  0   0       1
#> 2 0 4800 4400 3500 3700 200 200 3600 100  0  0   0       2
#> 3 0 5300 4400 3400 3700 200 200 3500 100  0  0   0       2
#> 4 0 4300 4800 2700 2900 200 200 2700   0  0  0   0       1
#> 5 0 4800 4800 3100 3300 200 200 3100   0  0  0   0       1
#> 6 0 5300 4800 3100 3400 200 200 3200   0  0  0   0       2

This code will put all the data together with the appropriate identifier for which dataset it comes from:

# Tag every data frame with the name of the list element it came from.
# seq_along() (rather than 1:length(dat)) makes this a no-op for an empty
# list instead of an out-of-bounds error, and setNames() carries the original
# element names through lapply() so `dat` stays a named list.
dat <- lapply(setNames(seq_along(dat), names(dat)), function(i) {
  x <- dat[[i]]
  x$alphabet <- names(dat)[i]
  x
})

# Stack the tagged data frames row-wise into a single table; the `alphabet`
# column records which list element each row belongs to.
all_dat <- dplyr::bind_rows(dat)

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1