'Summary DataFrame: Aggregating data at specific timestamps for different categories and the day overall
I have a large dataframe that's updated every day containing the following:
structure(list(date = structure(c(19038, 19038, 19038, 19038,
19038, 19038, 19038, 19038, 19038, 19038, 19038, 19038, 19038,
19038, 19038), class = "Date"), type = c("USD", "USD", "USD",
"USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",
"USD", "USD", "USD"), quantity = c(0.035, 0.011, 0.02, 0.02,
0.019, 0.036, 0.001, 0.003, 0.004, 0.065, 0.002, 0.001, 0.002,
0.002, 0.013), cumvol = c(0.035, 0.046, 0.066, 0.086, 0.105,
0.141, 0.142, 0.145, 0.149, 0.214, 0.216, 0.217, 0.219, 0.221,
0.234), time = structure(c(24422, 24430, 24453, 24463, 24468,
24476, 24485, 24583, 24790, 24862, 24950, 24953, 25047, 25085,
25085), units = "secs", class = c("hms", "difftime"))), row.names = c(NA,
-15L), class = c("data.table", "data.frame"))
This is the code used to attempt to achieve the result (helpfully produced by Dan Adams)
times_of_interest <- c(7, 9, 11)
d %>%
mutate(across(everything(), as.character)) %>%
mutate(across(quantity:cumvol, as.numeric)) %>%
mutate(type = factor(type)) %>%
mutate(date = ymd(date)) %>%
mutate(time = as_hms(time)) %>%
mutate(date = date + sample(0:1, nrow(.), T)) %>%
mutate(time_hr = hour(time)) %>%
filter(time_hr %in% times_of_interest) %>%
group_by(date, type, time_hr) %>%
summarize(cat_total = sum(quantity), .groups = "drop") %>%
group_by(date) %>%
mutate(date_total = sum(cat_total)) %>%
ungroup()
The output delivers this:
structure(list(date = structure(c(19034, 19034, 19034, 19034,
19035, 19035, 19035, 19035, 19035, 19035, 19035, 19035, 19037,
19037, 19037, 19037, 19037, 19037, 19037, 19037, 19037, 19038,
19038, 19038, 19038, 19038, 19038, 19038, 19038, 19038, 19039,
19039, 19039), class = "Date"), type = structure(c(3L, 5L, 5L,
5L, 2L, 2L, 3L, 3L, 3L, 5L, 5L, 5L, 2L, 2L, 3L, 3L, 3L, 1L, 5L,
5L, 5L, 2L, 2L, 2L, 3L, 3L, 3L, 5L, 5L, 5L, 2L, 3L, 5L), .Label = c("DNB",
"UKY", "UKS", "T/N cl DBV", "USD"), class = "factor"), time_hr = c(11L,
7L, 9L, 11L, 9L, 11L, 7L, 9L, 11L, 7L, 9L, 11L, 9L, 11L, 7L,
9L, 11L, 11L, 7L, 9L, 11L, 7L, 9L, 11L, 7L, 9L, 11L, 7L, 9L,
11L, 7L, 7L, 7L), cat_total = c(0.739, 8.714, 3.281, 3.263, 0.815,
1.025, 2.071, 0.661, 0.328, 5.633, 0.776, 2.126, 0.465, 0.15,
2.046, 1.203, 0.137, 0.058, 2.341, 4.215, 1.705, 0.01, 0.335,
0.15, 4.323, 1.157, 0.031, 8.607, 3.624, 2.603, 0.865, 1.599,
5.721), date_total = c(9893.441, 9893.441, 9893.441, 9893.441,
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441,
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441,
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441,
9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441, 9893.441,
9893.441)), row.names = c(NA, -33L), class = c("tbl_df", "tbl",
"data.frame"))
What I'd like to do is change the summary table so that it takes the total quantities in the types (and overall) at some specific timestamps; say 12:00, 15:00, and 16:00, for each day.
Unfortunately, for whatever reason the output for this dataframe is always at two times (the first two in the times_of_interest vector)./
str output is:
Classes ‘data.table’ and 'data.frame': 15 obs. of 5 variables:
$ date : Date, format: "2022-02-15" "2022-02-15" "2022-02-15" "2022-02-15" ...
$ type : chr "USD" "USD" "USD" "USD" ...
$ quantity: num 0.035 0.011 0.02 0.02 0.019 0.036 0.001 0.003 0.004 0.065 ...
$ cumvol : num 0.035 0.046 0.066 0.086 0.105 0.141 0.142 0.145 0.149 0.214 ...
$ time : 'hms' num 06:47:02 06:47:10 06:47:33 06:47:43 ...
..- attr(*, "units")= chr "secs"
- attr(*, ".internal.selfref")=<externalptr>
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|
