'Rearranging a complex dataset in R

I ran an experiment where participants were randomly assigned to viewing 5 out of 8 treatments on different issues. Moreover, for each issue they were assigned to, participants could see 1 out of N different variations of the treatment. The way the data is organized in the dataset, there is a separate set of variables for each issue, so for each row 5 sets of columns (for the 5 treatments participants saw) has valid values and the remaining 3 sets of columns (for the 3 treatments participants did not see) are all empty. After seeing treatments on 5 issues, participants answered questions about the topics for all 8 issues, so that for the 3 issues on which participants were not assigned to see a treatment, they serve as control group.

I would like to rearrange the dataset so that each row now representing a respondent is stacked into 8 rows so that:

  • For the 5 out of 8 issues participants saw treatments for, one new column designates which topic the treatment refers to, a second new column designates the treatment (based on column has valid values for that row), a third set of columns reports the valid values for the columns of the treatment the participant was assigned to, and the fourth set of columns reports the values of the final questions asked for that issue.
  • For the 3 out of 8 treatments participants did not see, the same first new column should designate the issue, the second report NA if they do not have valid values for the corresponding column, the third set of columns also report NAs (because participants did not answer questions on the treatment) and the fourth set of columns should report the values of the final questions asked for that issue.

A simplified version of my dataset looks like the one generated by this code:

ID <- c(1, 2)
T1_1    <- c(   1   ,   NA  )
T1_2    <- c(   NA  ,   NA  )
T1_3    <- c(   NA  ,   NA  )
T2_1    <- c(   NA  ,   NA  )
T2_2    <- c(   NA  ,   NA  )
T2_3    <- c(   NA  ,   5   )
T3_1    <- c(   2   ,   0   )
T3_2    <- c(   NA  ,   NA  )
T3_3    <- c(   NA  ,   NA  )
T4_1    <- c(   NA  ,   NA  )
T4_2    <- c(   NA  ,   NA  )
T4_3    <- c(   NA  ,   1   )
T5_1    <- c(   6   ,   2   )
T5_2    <- c(   NA  ,   NA  )
T5_3    <- c(   NA  ,   NA  )
T6_1    <- c(   NA  ,   5   )
T6_2    <- c(   NA  ,   NA  )
T6_3    <- c(   NA  ,   NA  )
T7_1    <- c(   NA  ,   NA  )
T7_2    <- c(   1   ,   NA  )
T7_3    <- c(   NA  ,   NA  )
T8_1    <- c(   0   ,   NA  )
T8_2    <- c(   NA  ,   NA  )
T8_3    <- c(   NA  ,   NA  )
DV1_1   <- c(   1   ,   2   )
DV1_2   <- c(   10  ,   4   )
DV2_1   <- c(   10  ,   9   )
DV2_2   <- c(   5   ,   3   )
DV3_1   <- c(   2   ,   4   )
DV3_2   <- c(   10  ,   1   )
DV4_1   <- c(   3   ,   2   )
DV4_2   <- c(   1   ,   3   )
DV5_1   <- c(   6   ,   7   )
DV5_2   <- c(   0   ,   10  )
DV6_1   <- c(   2   ,   4   )
DV6_2   <- c(   10  ,   9   )
DV7_1   <- c(   1   ,   3   )
DV7_2   <- c(   2   ,   3   )
DV8_1   <- c(   8   ,   9   )
DV8_2   <- c(   5   ,   6   )

data <- rbind(data.frame(ID,T1_1,T1_2,T1_3,T2_1,T2_2,T2_3,T3_1,T3_2,T3_3,T4_1,T4_2,T4_3,T5_1,T5_2,T5_3,T6_1,T6_2,T6_3,T7_1,T7_2,T7_3,T8_1,T8_2,T8_3,DV1_1,DV1_2,DV2_1,DV2_2,DV3_1,DV3_2,DV4_1,DV4_2,DV5_1,DV5_2,DV6_1,DV6_2,DV7_1,DV7_2,DV8_1,DV8_2))

> data
  ID T1_1 T1_2 T1_3 T2_1 T2_2 T2_3 T3_1 T3_2 T3_3 T4_1 T4_2 T4_3 T5_1 T5_2 T5_3 T6_1 T6_2 T6_3 T7_1 T7_2 T7_3 T8_1
1  1    1   NA   NA   NA   NA   NA    2   NA   NA   NA   NA   NA    6   NA   NA   NA   NA   NA   NA    1   NA    0
2  2   NA   NA   NA   NA   NA    5    0   NA   NA   NA   NA    1    2   NA   NA    5   NA   NA   NA   NA   NA   NA
  T8_2 T8_3 DV1_1 DV1_2 DV2_1 DV2_2 DV3_1 DV3_2 DV4_1 DV4_2 DV5_1 DV5_2 DV6_1 DV6_2 DV7_1 DV7_2 DV8_1 DV8_2
1   NA   NA     1    10    10     5     2    10     3     1     6     0     2    10     2     2     8     5
2   NA   NA     2     4     9     3     4     1     2     3     7    10     4     9     3     3     9     6

I would like to rearrange the dataset as follows:

> data2
   ID Topic Treat Value DV_1 DV_2
1   1     1     1     1    1   10
2   1     2    NA    NA   10    5
3   1     3     1     2    2   10
4   1     4    NA    NA    3    1
5   1     5     1     6    6    0
6   1     6    NA    NA    2   10
7   1     7     2     1    1    2
8   1     8     1     0    8    5
9   2     1    NA    NA    2    4
10  2     2     3     5    9    3
11  2     3     1     0    4    1
12  2     4     3     1    2    3
13  2     5     1     2    7   10
14  2     6     1     5    4    9
15  2     7    NA    NA    3    3
16  2     8    NA    NA    9    6

Many thanks in advance for your help!



Solution 1:[1]

Using dplyr and tidyr, I would do the following.

Gather the data for treatments (df_t) and questions (df_dv) separately.

library(dplyr)
library(tidyr)

df_t <- data %>% 
  select(ID, starts_with("T")) %>% # Select relevant columns for this dataset
  pivot_longer(-ID,
               names_pattern = "T(\\d+)_(\\d+)", # Grab the numbers from the column name
               names_to = c("Topic", "Treat"), # First number goes into column 'Topic', second into 'Treat' 
               values_to = "Value") %>% # Values go into column 'Value'
  filter(!is.na(Value)) # Remove rows with no 'Value'

# Similar for 'DV' columns
df_dv <- data %>% 
  select(ID, starts_with("DV")) %>% 
  pivot_longer(-ID, names_pattern = "DV(\\d+)_(\\d+)", names_to = c("Topic", "DV")) %>% 
  pivot_wider(names_from = DV, values_from = value, names_prefix = "DV_")

# Combine the two
df_dv %>% 
  full_join(df_t)

which gives

# A tibble: 16 × 6
      ID Topic  DV_1  DV_2 Treat Value
   <dbl> <chr> <dbl> <dbl> <chr> <dbl>
 1     1 1         1    10 1         1
 2     1 2        10     5 NA       NA
 3     1 3         2    10 1         2
 4     1 4         3     1 NA       NA
 5     1 5         6     0 1         6
 6     1 6         2    10 NA       NA
 7     1 7         1     2 2         1
 8     1 8         8     5 1         0
 9     2 1         2     4 NA       NA
10     2 2         9     3 3         5
11     2 3         4     1 1         0
12     2 4         2     3 3         1
13     2 5         7    10 1         2
14     2 6         4     9 1         5
15     2 7         3     3 NA       NA
16     2 8         9     6 NA       NA

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1