'Filter rows per group with a specific condition

I have a grouped data frame (meaning that I have a list of patients and each patient has several treatments). I now want to remove all rows, that do not meet a specific criteria:

The last column contains yes and no. I only want the rows that contains the last no and the first yes.

PATIENT.ID   Caffeinefactor    
---------------------------
21       no      
21       no      
21       no
21       yes      
21       yes      
34       no      
34       no      
34       yes      
34       yes      
16       no      
16       no      
DF = structure(list(PATIENT.ID = c(210625L, 210625L, 210625L, 210625L, 
210625L, 210625L, 210625L, 210625L, 210625L, 210625L, 210625L, 
210625L, 210625L, 210625L, 210625L, 210625L, 210625L, 220909L, 
220909L, 220909L, 220909L, 220909L, 220909L, 220909L, 220909L, 
220909L, 220909L, 221179L, 221179L, 221179L, 221179L, 221179L, 
221179L, 221179L, 221179L, 221179L, 221179L, 221179L, 221179L, 
221179L, 221179L, 301705L, 301705L, 301705L, 301705L, 301705L, 
301705L, 301705L, 301705L, 301705L, 301705L, 301705L, 301705L, 
301705L, 301705L, 301705L, 303926L, 303926L, 303926L, 303926L
), PATIENT.TREATMENT.NUMBER = c(1L, 2L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 1L, 2L, 3L, 4L, 
5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 17L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 12L, 13L, 14L, 15L, 16L, 1L, 2L, 3L, 4L), Caffeinefactor = c("no", 
"no", "no", "no", "yes", "yes", "yes", "no", "yes", "yes", "yes", 
"yes", "yes", "no", "no", "yes", "yes", "yes", "yes", "yes", 
"yes", "yes", "yes", "yes", "yes", "yes", "no", "no", "no", "no", 
"no", "no", "no", "no", "no", "no", "yes", "yes", "yes", "yes", 
"yes", "no", "no", "no", "no", "no", "no", "yes", "no", "yes", 
"yes", "yes", "yes", "yes", "yes", "yes", "no", "no", "no", "no"
)), row.names = c(NA, -60L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x7fe7f7002ee0>)

Expected Output:

PATIENT.ID   Caffeinefactor    
---------------------------    
21       no
21       yes                
34       no      
34       yes           

I started off with this code:

daf1 <- df %>% 
  group_by(PATIENT.ID) %>% 
  mutate(firstc = (Caffeinefactor == 'yes' & lag(Caffeinefactor) == 'no')) %>% 
  # Find the first row with `double_B` in each group, filter out rows after it
  filter(row_number() <= min(which(firstc == TRUE)))   

however, I cannot seem to solve the problem, to delete everything before AND after the two needed rows.

r


Solution 1:[1]

I edited @Adromines code and now I resolved the issues with the "yes,no,yes" rows:

daf1 <- df %>% 
  setDT(df) %>% 
  # add row id to confirm if the correct rows were selected
  dplyr::mutate(row_id = row_number()) %>% 
  dplyr::group_by(PATIENT.ID) %>% 
  dplyr::mutate(first_yes = cumsum(Caffeinefactor == "yes"), 
         last_no = (Caffeinefactor == "no") * row_number(), 
         last_no = ifelse(first_yes == 0, last_no, 0)) %>% 
  dplyr:: filter((first_yes == 1 & Caffeine >0) | last_no == max(last_no)) %>%
  dplyr::select(-first_yes, -last_no)

Solution 2:[2]

library(data.table)

# dummy data              
df <- data.table(patient = c(21, 21, 21, 21, 21, 34, 34, 34, 34, 16, 16)
                 , caffiene = c("no", "no", "yes", "yes", "yes", "no", "no", "yes", "yes", "no", "no")
                 )
df[, time := 1:.N][]

    patient caffiene time
 1:      21       no    1
 2:      21       no    2
 3:      21      yes    3
 4:      21      yes    4
 5:      21      yes    5
 6:      34       no    6
 7:      34       no    7
 8:      34      yes    8
 9:      34      yes    9
10:      16       no   10
11:      16       no   11

# identify 1st yes
df[caffiene == 'yes', nth_yes := 1:.N, patient]

# fetch row index of 1st 'yes' and the 'no' immediately preceeding
x <- df[, .I[nth_yes == 1 & !is.na(nth_yes)] ]
y <- x-1

# return corresponding rows
df[c(y,x)][, nth_yes := NULL][order(patient)]

   patient caffiene time
1:      21       no    2
2:      21      yes    3
3:      34       no    7
4:      34      yes    8

Solution 3:[3]

In case there are other columns, you can try the following:

EDITED: it should retrieve the first yes and the last no before the first yes.

Edit 2: based on the provided dataset and the new conditions:

  • take the row with the first "yes"
  • if there is a "no" before "yes", then take the last no
  • if there is not a "no" before "yes", then take the first "no" after "yes"

The code is quite hacky though.

library(dplyr)

df %>% 
    # add row id to confirm if the correct rows were selected
    mutate(row_id = row_number()) %>% 
    group_by(PATIENT.ID) %>% 
    mutate(first_yes = cumsum(Caffeinefactor == "yes"),
           
           # in case there are multiple first_yes == 1,
           # then only keep the first one as 1
           first_yes = ifelse(first_yes == lag(first_yes, default = -1), -1, first_yes),
           
           # is no
           last_no = (Caffeinefactor == "no") * row_number(),
           
           # keep number as is if no appears before yes,
           # otherwise make negative to differentiate
           last_no = ifelse(first_yes == 0, last_no, -last_no),
           
           # is there a no before yes for this patient
           no_before_yes = any(Caffeinefactor == "no" & first_yes < 1)) %>% 
    
    # create conditions as variables
    mutate(
        # first yes, simple enough
        cond1 = first_yes == 1,
        
        # if no before yes, then take the last no
        cond2 = no_before_yes & (last_no == max(last_no) & first_yes < 1), 
        
        # if no after yes, then take the first no after yes
        cond3 = !no_before_yes & (last_no == min(last_no) & first_yes >= 1 & last_no < 0)  
    ) %>% 
    filter(cond1 | cond2 | cond3) %>% 
    select(-cond1, -cond2, -cond3, -first_yes, -last_no, -no_before_yes)
#> # A tibble: 8 x 4
#> # Groups:   PATIENT.ID [5]
#>   PATIENT.ID PATIENT.TREATMENT.NUMBER Caffeinefactor row_id
#>        <int>                    <int> <chr>           <int>
#> 1     210625                        1 no                  1
#> 2     210625                        6 yes                 5
#> 3     220909                        1 yes                18
#> 4     221179                        1 no                 28
#> 5     221179                       10 yes                37
#> 6     301705                        1 no                 42
#> 7     301705                        7 yes                48
#> 8     303926                        1 no                 57

Input:

df <- data.frame(
    PATIENT.ID= c(21, 21, 21, 21, 21, 34, 34, 34, 34, 16, 16), 
    Caffeinefactor = c("no", "no", "yes", "yes", "yes", "no", "no", "yes", "yes", "no", "no")
)

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Rdog
Solution 2
Solution 3