'separate_columns for tidyr

Let's say I had a survey question that read:

What did you eat?
  • [ ] apple
  • [ ] pear
  • [x] banana
  • [x] grapes

Now, I have the endorsed options as comma-separated strings in one variable. I wrote myself a little helper to turn this comma-separated list of answers into boolean dummies showing whether each box was checked.

df <- data.frame(
  x = 1:5,
  ate = c("apple", "apple, pear, banana", "banana, grapes", NA_character_, ""),
  stringsAsFactors = FALSE
)

separate_columns <- function(df, col, convert = TRUE, sep = ", ") {
  colname <- deparse(substitute(col))
  # sorry about this ugly non-rlang approach, hoping not to reuse this
  df$.splitcol <- df %>% pull(colname)
  separate_rows(df, .splitcol, convert = convert, sep = sep) %>% 
    mutate(.splitcol = stringr::str_c(colname, "_", .splitcol), value = 1) %>% 
    mutate(.splitcol = if_else(is.na(.splitcol), stringr::str_c(colname, "_nonresponse"), .splitcol)) %>% 
    spread(.splitcol, value, fill = 0) %>% 
    select(-colname)
}

separate_columns(df, ate)

Gets me to this:

x ate_apple ate_banana ate_grapes ate_nonresponse ate_pear 1 1 0 0 0 0 2 1 1 0 0 1 3 0 1 1 0 0 4 0 0 0 1 0 5 0 0 0 1 0

Writing the helper felt clunky, and I feel like I'm missing a more tidyverse way of accomplishing the same transformation (despite lots of searching).

Also, I found no easy way for missings to propagate using this method (I'd prefer if all dummies would be missing if the response was NA, but 0 if it was an empty string). So, I'd rather get this

x ate_apple ate_banana ate_grapes ate_pear 1 1 0 0 0 2 1 1 0 1 3 0 1 1 0 4 NA NA NA NA 5 0 0 0 0

Is there a nicer tidyverse way?



Solution 1:[1]

After changing into 'long' format by splitting the 'ate' column by the delimiter ,, create a column of 1 and spread from 'long' to 'wide'

library(tidyverse)
df %>% 
  separate_rows(ate, sep=", ", convert = TRUE) %>%
  mutate(ate = replace(ate, is.na(ate), "NA"), 
         n = paste(NA ^ (ate == "NA")), 
         ate = paste0("ate_", replace(ate, ate == "", "nonresponse" ))) %>% 
  spread(ate, n, fill = "0") %>% 
  mutate_at(vars(-x, -ate_NA), 
   funs(replace(as.integer(.), ate_NA=="NA", NA_integer_))) %>% 
   select(-ate_NA)
#   x ate_apple ate_banana ate_grapes ate_nonresponse ate_pear
#1 1         1          0          0               0        0
#2 2         1          1          0               0        1
#3 3         0          1          1               0        0
#4 4        NA         NA         NA              NA       NA
#5 5         0          0          0               1        0

Solution 2:[2]

I take a different approach, by first extracting want was there to eat and then matching it in the data:

total_eat_list <- map(df$ate, str_split, patter = ",") %>% 
  unlist() %>% 
  str_trim() %>% 
  na.exclude() %>% 
  unique()

Remove empty strings:

total_eat_list <- total_eat_list[total_eat_list != ""]
total_eat_list 
# [1] "apple"  "pear"   "banana" "grapes"

Now lets map everything in the original data:

map_df(total_eat_list, ~
  df %>% 
  mutate(ate_what = str_c("ate_", .x), 
         ind = case_when(str_detect(string = df$ate, .x) ~ 1, 
                         !str_detect(string = df$ate, .x) ~ 0, 
                         TRUE ~ NA_real_))) %>% 
  spread(ate_what, ind) %>% 
  select(-ate) 

# A tibble: 5 x 5
#       x ate_apple ate_banana ate_grapes ate_pear
#   <int>     <dbl>      <dbl>      <dbl>    <dbl>
# 1     1         1          0          0        0
# 2     2         1          1          0        1
# 3     3         0          1          1        0
# 4     4        NA         NA         NA       NA
# 5     5         0          0          0        0

The nice thing is that NAs are infectious for the str_-functions.


As function:

who_ate_what <- function(data, col) {
  col <- enquo(col)
  col_name <- quo_name(col)

  match_list <- data %>%
    select(!!col) %>%
    map(str_split, patter = ",") %>%
    unlist() %>%
    str_trim() %>%
    na.exclude() %>%
    unique()

  match_list <- match_list[match_list != ""]

  map_df(match_list, ~
           data %>%
           mutate(matches = str_c(!!col_name, "_", .x),
                  ind = case_when(str_detect(string = !!col, .x) ~ 1,
                                  !str_detect(string = !!col, .x) ~ 0,
                                  TRUE ~ NA_real_)
           )) %>%
    spread(matches, ind) %>% 
    select(-!!col)
}

Solution 3:[3]

This is way too verbose I'm sure, but I guess its a start.

library(tidyverse)

df <- data.frame(
  x = 1:5,
  ate = c("apple", "apple, pear, banana", "banana, grapes", NA_character_, ""),
  stringsAsFactors = FALSE
)

df %>% 
  nest(-x) %>% 
  mutate(data = map(data, ~str_split(.x$ate, ",") %>% unlist())) %>% 
  unnest() %>% 
  group_by(x, data) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  spread(data, n, fill = NA) %>%
  select(-`<NA>`) %>%
  mutate(rs = rowSums(.[2:ncol(.)],na.rm = TRUE)) %>%
  gather(nm, val, -x, -rs) %>%
  mutate(val = case_when(
    is.na(val) & rs > 0 ~ "0",
    is.na(val) & rs == 0 ~ "NA",
    !is.na(val) ~ as.character(val)
  ), val = as.numeric(val)) %>%
  spread(nm, val, fill = NA) %>%
  select(-rs, -V1)

#> # A tibble: 5 x 6
#>       x ` banana` ` grapes` ` pear` apple banana
#>   <int>     <dbl>     <dbl>   <dbl> <dbl>  <dbl>
#> 1     1         0         0       0     1      0
#> 2     2         1         0       1     1      0
#> 3     3         0         1       0     0      1
#> 4     4        NA        NA      NA    NA     NA
#> 5     5         0         0       0     0      0

EDIT Lets wrap this into a function and take care of the name issue. I adopted the splitting from your original function to make the use of quosures easier.

my_sep_fun <- function(data, col){
  col <- enquo(col)
  col_name <- quo_name(col)

  data %>% 
   separate_rows(!!col, sep =', ', convert = TRUE) %>%
   group_by(x, !!col) %>%
   summarise(n = n()) %>%
   ungroup() %>%
   spread(!!col, n, fill = NA) %>%
   select(-`<NA>`) %>%
   mutate(rs = rowSums(.[2:ncol(.)],na.rm = TRUE)) %>%
   gather(nm, val, -x, -rs) %>%
   mutate(val = case_when(
    is.na(val) & rs > 0 ~ "0",
    is.na(val) & rs == 0 ~ "NA",
    !is.na(val) ~ as.character(val)
   ), val = as.numeric(val)) %>%
   spread(nm, val, fill = NA) %>%
   select(-rs, -V1) %>%
   rename_at(vars(2:ncol(.)), funs(paste0(!!col_name,"_", .)))
}

my_sep_fun(df, ate)
#> # A tibble: 5 x 5
#>       x ate_apple ate_banana ate_grapes ate_pear
#>   <int>     <dbl>      <dbl>      <dbl>    <dbl>
#> 1     1         1          0          0        0
#> 2     2         1          1          0        1
#> 3     3         0          1          1        0
#> 4     4        NA         NA         NA       NA
#> 5     5         0          0          0        0

Created on 2018-08-20 by the reprex package (v0.2.0).

Solution 4:[4]

One solution, much less verbose, in just three lines. Once you have the dataframe:

First, separate the values in each cell:

df <- separate_rows_(df, 'ate')

Second, dummify every answer using the function dummify from DataExplorer:

df <- DataExplorer::dummify(df, 'ate')

Third, aggregate the redundant rows like that:

df <- aggregate(df[,2:6], by=df$x, FUN= sum)

(you could also apply a max function here since you want to capture all 1's in the columns).

Done!

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1
Solution 2
Solution 3
Solution 4 Martin Gal