'Filter rows per group with a specific condition
I have a grouped data frame (meaning that I have a list of patients and each patient has several treatments). I now want to remove all rows, that do not meet a specific criteria:
The last column contains yes and no. I only want the rows that contains the last no and the first yes.
PATIENT.ID Caffeinefactor
---------------------------
21 no
21 no
21 no
21 yes
21 yes
34 no
34 no
34 yes
34 yes
16 no
16 no
DF = structure(list(PATIENT.ID = c(210625L, 210625L, 210625L, 210625L,
210625L, 210625L, 210625L, 210625L, 210625L, 210625L, 210625L,
210625L, 210625L, 210625L, 210625L, 210625L, 210625L, 220909L,
220909L, 220909L, 220909L, 220909L, 220909L, 220909L, 220909L,
220909L, 220909L, 221179L, 221179L, 221179L, 221179L, 221179L,
221179L, 221179L, 221179L, 221179L, 221179L, 221179L, 221179L,
221179L, 221179L, 301705L, 301705L, 301705L, 301705L, 301705L,
301705L, 301705L, 301705L, 301705L, 301705L, 301705L, 301705L,
301705L, 301705L, 301705L, 303926L, 303926L, 303926L, 303926L
), PATIENT.TREATMENT.NUMBER = c(1L, 2L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 17L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 12L, 13L, 14L, 15L, 16L, 1L, 2L, 3L, 4L), Caffeinefactor = c("no",
"no", "no", "no", "yes", "yes", "yes", "no", "yes", "yes", "yes",
"yes", "yes", "no", "no", "yes", "yes", "yes", "yes", "yes",
"yes", "yes", "yes", "yes", "yes", "yes", "no", "no", "no", "no",
"no", "no", "no", "no", "no", "no", "yes", "yes", "yes", "yes",
"yes", "no", "no", "no", "no", "no", "no", "yes", "no", "yes",
"yes", "yes", "yes", "yes", "yes", "yes", "no", "no", "no", "no"
)), row.names = c(NA, -60L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x7fe7f7002ee0>)
Expected Output:
PATIENT.ID Caffeinefactor
---------------------------
21 no
21 yes
34 no
34 yes
I started off with this code:
daf1 <- df %>%
group_by(PATIENT.ID) %>%
mutate(firstc = (Caffeinefactor == 'yes' & lag(Caffeinefactor) == 'no')) %>%
# Find the first row with `double_B` in each group, filter out rows after it
filter(row_number() <= min(which(firstc == TRUE)))
however, I cannot seem to solve the problem, to delete everything before AND after the two needed rows.
Solution 1:[1]
I edited @Adromines code and now I resolved the issues with the "yes,no,yes" rows:
daf1 <- df %>%
setDT(df) %>%
# add row id to confirm if the correct rows were selected
dplyr::mutate(row_id = row_number()) %>%
dplyr::group_by(PATIENT.ID) %>%
dplyr::mutate(first_yes = cumsum(Caffeinefactor == "yes"),
last_no = (Caffeinefactor == "no") * row_number(),
last_no = ifelse(first_yes == 0, last_no, 0)) %>%
dplyr:: filter((first_yes == 1 & Caffeine >0) | last_no == max(last_no)) %>%
dplyr::select(-first_yes, -last_no)
Solution 2:[2]
library(data.table)
# dummy data
df <- data.table(patient = c(21, 21, 21, 21, 21, 34, 34, 34, 34, 16, 16)
, caffiene = c("no", "no", "yes", "yes", "yes", "no", "no", "yes", "yes", "no", "no")
)
df[, time := 1:.N][]
patient caffiene time
1: 21 no 1
2: 21 no 2
3: 21 yes 3
4: 21 yes 4
5: 21 yes 5
6: 34 no 6
7: 34 no 7
8: 34 yes 8
9: 34 yes 9
10: 16 no 10
11: 16 no 11
# identify 1st yes
df[caffiene == 'yes', nth_yes := 1:.N, patient]
# fetch row index of 1st 'yes' and the 'no' immediately preceeding
x <- df[, .I[nth_yes == 1 & !is.na(nth_yes)] ]
y <- x-1
# return corresponding rows
df[c(y,x)][, nth_yes := NULL][order(patient)]
patient caffiene time
1: 21 no 2
2: 21 yes 3
3: 34 no 7
4: 34 yes 8
Solution 3:[3]
In case there are other columns, you can try the following:
EDITED: it should retrieve the first yes and the last no before the first yes.
Edit 2: based on the provided dataset and the new conditions:
- take the row with the first "yes"
- if there is a "no" before "yes", then take the last no
- if there is not a "no" before "yes", then take the first "no" after "yes"
The code is quite hacky though.
library(dplyr)
df %>%
# add row id to confirm if the correct rows were selected
mutate(row_id = row_number()) %>%
group_by(PATIENT.ID) %>%
mutate(first_yes = cumsum(Caffeinefactor == "yes"),
# in case there are multiple first_yes == 1,
# then only keep the first one as 1
first_yes = ifelse(first_yes == lag(first_yes, default = -1), -1, first_yes),
# is no
last_no = (Caffeinefactor == "no") * row_number(),
# keep number as is if no appears before yes,
# otherwise make negative to differentiate
last_no = ifelse(first_yes == 0, last_no, -last_no),
# is there a no before yes for this patient
no_before_yes = any(Caffeinefactor == "no" & first_yes < 1)) %>%
# create conditions as variables
mutate(
# first yes, simple enough
cond1 = first_yes == 1,
# if no before yes, then take the last no
cond2 = no_before_yes & (last_no == max(last_no) & first_yes < 1),
# if no after yes, then take the first no after yes
cond3 = !no_before_yes & (last_no == min(last_no) & first_yes >= 1 & last_no < 0)
) %>%
filter(cond1 | cond2 | cond3) %>%
select(-cond1, -cond2, -cond3, -first_yes, -last_no, -no_before_yes)
#> # A tibble: 8 x 4
#> # Groups: PATIENT.ID [5]
#> PATIENT.ID PATIENT.TREATMENT.NUMBER Caffeinefactor row_id
#> <int> <int> <chr> <int>
#> 1 210625 1 no 1
#> 2 210625 6 yes 5
#> 3 220909 1 yes 18
#> 4 221179 1 no 28
#> 5 221179 10 yes 37
#> 6 301705 1 no 42
#> 7 301705 7 yes 48
#> 8 303926 1 no 57
Input:
df <- data.frame(
PATIENT.ID= c(21, 21, 21, 21, 21, 34, 34, 34, 34, 16, 16),
Caffeinefactor = c("no", "no", "yes", "yes", "yes", "no", "no", "yes", "yes", "no", "no")
)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Rdog |
| Solution 2 | |
| Solution 3 |
