'Sample rows of data for iteratively smaller samples

I have the following dataset:

group<- c(rep(1, 200), rep(2, 200), rep(3, 200), rep(4, 200), rep(5, 200), rep(6, 200)) 
injection<- c(rep(1, 200), rep(0, 600), rep(1, 400)) 
art_light<- c(rep(1, 400), rep(0, 600), rep(1, 200))
seasonal_light<- c(rep(1, 600), rep(0, 200), rep(1, 200),  rep(1, 200)) 

## generate data frame for species 1-3; where Species B is unaffected by all of these things and A and C are subtly different
## species is a random effect and so results are more generalizable
dat1 <- data.frame(group, injection,art_light,seasonal_light)
dat1$species = "A"
dat2 <- data.frame(group, injection,art_light,seasonal_light)
dat2$species = "B"
dat3 <- data.frame(group, injection,art_light,seasonal_light)
dat3$species = "C"

#################################
# Simulated Response  Variables #
#################################

alpha = 1
beta1 = 10
beta2 = 2
beta3 = 20
beta4 = 10

e1= rnorm(1200, 5, sd=1)
e2 = rlnorm(1200)
e = rcauchy(1200) 
e3 = floor(runif(1200, min = 0, max = 20))
e4 = rpois(1200, lambda = 4)
e5 = rlnorm(1200)

dat1$lh<-alpha + beta1*injection + beta2*art_light + 
beta3*seasonal_light + beta4*injection*seasonal_light + e1
dat1$hb<-alpha + beta1*injection + e2

dat2$lh<- e
dat2$hb<- alpha + beta3*injection + e3

dat3$lh<-alpha + beta1*injection + beta2*art_light + 
beta3*seasonal_light +e4
dat3$hb<-alpha + beta2*injection + e5
dat <- do.call("rbind", list(dat1, dat2, dat3))

I want to randomly sample rows within each group with no replacement. Sampling will be done iteratively such that in the first iteration of sampling, the number of rows sampled are the number of rows in group (n) and in the next iteration the number of sampled rows would be n-1.

These results should be combined into a data frame. Each subsample will be distinguished by a new variable sample_num that represents the number of rows sampled. Below is an example of the result data frame for the first 4 samples. The actual result should continue for all possible subsample sizes down to sampling of 10 rows.

samp1<-
  dat %>% 
  group_by(group) %>% 
  sample_n(size = 600)
samp1$sample_num<-600

samp2<-
  dat %>% 
  group_by(group) %>% 
  sample_n(size = 599)
samp2$sample_num<-599

samp3<-
  dat %>% 
  group_by(group) %>% 
  sample_n(size = 598)
samp3$sample_num<-598

samp4<-
  dat %>% 
  group_by(group) %>% 
  sample_n(size = 597)
samp4$sample_num<-597

samp_dat<- rbind(samp1,samp2,samp3,samp4)


Solution 1:[1]

You can do:

library(dplyr)
library(purrr)

res <- map_df(set_names(600:10),
              ~ dat %>%
                group_by(group) %>%
                sample_n(size = .x),
              .id = "sample_num")

Giving:

# A tibble: 1,081,530 x 8
# Groups:   group [6]
   sample_num group injection art_light seasonal_light species       lh    hb
   <chr>      <dbl>     <dbl>     <dbl>          <dbl> <chr>      <dbl> <dbl>
 1 600            1         1         1              1 B       -403.    38   
 2 600            1         1         1              1 A         49.3   12.5 
 3 600            1         1         1              1 B          0.961 31   
 4 600            1         1         1              1 A         48.7   12.1 
 5 600            1         1         1              1 B         -0.691 23   
 6 600            1         1         1              1 A         47.7   11.4 
 7 600            1         1         1              1 C         37      3.55
 8 600            1         1         1              1 B         -0.327 22   
 9 600            1         1         1              1 B         -7.71  32   
10 600            1         1         1              1 B          0.153 36   
# ... with 1,081,520 more rows

Solution 2:[2]

data.table is incredibly fast at this sort of thing

library(data.table)

rbindlist(lapply(600:10, \(x) setDT(dat)[,.SD[sample(1:.N,x)], by=.(group)][,sample:=x]))

Output:

         group injection art_light seasonal_light species        lh        hb sample
      1:     1         1         1              1       C 35.000000  3.196606    600
      2:     1         1         1              1       A 46.424639 12.210558    600
      3:     1         1         1              1       C 33.000000  5.303823    600
      4:     1         1         1              1       A 47.316622 11.814838    600
      5:     1         1         1              1       C 39.000000  3.769120    600
     ---                                                                            
1081526:     6         1         1              1       A 47.249496 11.360076     10
1081527:     6         1         1              1       B -3.188948 29.000000     10
1081528:     6         1         1              1       A 47.263460 12.062339     10
1081529:     6         1         1              1       C 38.000000  3.307954     10
1081530:     6         1         1              1       B -2.760421 35.000000     10

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Ritchie Sacramento
Solution 2