Created
April 12, 2021 10:23
-
-
Save IyarLin/df8cdc6b277d957ea3df7ab9c8d88de4 to your computer and use it in GitHub Desktop.
dtplyr speed benchmarking
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require(data.table) | |
| require(dtplyr) | |
| require(dplyr) | |
| N <- 1e8 | |
| K <- 100 | |
| set.seed(1) | |
| DT <- data.table( | |
| id1 = sample(sprintf("id%03d", 1:K), N, TRUE), # large groups (char) | |
| id2 = sample(sprintf("id%03d", 1:K), N, TRUE), # large groups (char) | |
| id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE), # small groups (char) | |
| id4 = sample(K, N, TRUE), # large groups (int) | |
| id5 = sample(K, N, TRUE), # large groups (int) | |
| id6 = sample(N / K, N, TRUE), # small groups (int) | |
| v1 = sample(5, N, TRUE), # int in range [1,5] | |
| v2 = sample(5, N, TRUE), # int in range [1,5] | |
| v3 = sample(round(runif(100, max = 100), 4), N, TRUE) # numeric e.g. 23.5749 | |
| ) | |
| q1a <- system.time(DT[, sum(v1), keyby = id1])[3] | |
| q1b <- system.time(DT[, sum(v1), keyby = id1])[3] | |
| q2a <- system.time(DT[, sum(v1), keyby = "id1,id2"])[3] | |
| q2b <- system.time(DT[, sum(v1), keyby = "id1,id2"])[3] | |
| q3a <- system.time(DT[, list(sum(v1), mean(v3)), keyby = id3])[3] | |
| q3b <- system.time(DT[, list(sum(v1), mean(v3)), keyby = id3])[3] | |
| q4a <- system.time(DT[, lapply(.SD, mean), keyby = id4, .SDcols = 7:9])[3] | |
| q4b <- system.time(DT[, lapply(.SD, mean), keyby = id4, .SDcols = 7:9])[3] | |
| q5a <- system.time(DT[, lapply(.SD, sum), keyby = id6, .SDcols = 7:9])[3] | |
| q5b <- system.time(DT[, lapply(.SD, sum), keyby = id6, .SDcols = 7:9])[3] | |
| data_table_results <- list( | |
| q1a = q1a, q1b = q1b, | |
| q2a = q2a, q2b = q2b, | |
| q3a = q3a, q3b = q3b, | |
| q4a = q4a, q4b = q4b, | |
| q5a = q5a, q5b = q5b | |
| ) | |
| DF <- data.frame( | |
| stringsAsFactors = FALSE, | |
| id1 = sample(sprintf("id%03d", 1:K), N, TRUE), | |
| id2 = sample(sprintf("id%03d", 1:K), N, TRUE), | |
| id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE), | |
| id4 = sample(K, N, TRUE), | |
| id5 = sample(K, N, TRUE), | |
| id6 = sample(N / K, N, TRUE), | |
| v1 = sample(5, N, TRUE), | |
| v2 = sample(5, N, TRUE), | |
| v3 = sample(round(runif(100, max = 100), 4), N, TRUE) | |
| ) | |
| q1a <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>% | |
| summarise(sum(v1)) %>% as_tibble())[3] | |
| q1b <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>% | |
| summarise(sum(v1)) %>% as_tibble())[3] | |
| q2a <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>% | |
| summarise(sum(v1)) %>% as_tibble())[3] | |
| q2b <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>% | |
| summarise(sum(v1)) %>% as_tibble())[3] | |
| q3a <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>% summarise(sum(v1), mean(v3)) %>% as_tibble())[3] | |
| q3b <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>% | |
| summarise(sum(v1), mean(v3)) %>% as_tibble())[3] | |
| q4a <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>% | |
| summarise_at(vars(v1:v3), mean) %>% as_tibble())[3] | |
| q4b <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>% | |
| summarise_at(vars(v1:v3), mean) %>% as_tibble())[3] | |
| q5a <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>% | |
| summarise_at(vars(v1:v3), sum) %>% as_tibble())[3] | |
| q5b <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>% | |
| summarise_at(vars(v1:v3), sum) %>% as_tibble())[3] | |
| dtplyr_results <- list( | |
| q1a = q1a, q1b = q1b, | |
| q2a = q2a, q2b = q2b, | |
| q3a = q3a, q3b = q3b, | |
| q4a = q4a, q4b = q4b, | |
| q5a = q5a, q5b = q5b | |
| ) | |
| DF <- data.frame( | |
| stringsAsFactors = FALSE, | |
| id1 = sample(sprintf("id%03d", 1:K), N, TRUE), | |
| id2 = sample(sprintf("id%03d", 1:K), N, TRUE), | |
| id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE), | |
| id4 = sample(K, N, TRUE), | |
| id5 = sample(K, N, TRUE), | |
| id6 = sample(N / K, N, TRUE), | |
| v1 = sample(5, N, TRUE), | |
| v2 = sample(5, N, TRUE), | |
| v3 = sample(round(runif(100, max = 100), 4), N, TRUE) | |
| ) | |
| DF <- as.data.table(DF) | |
| q1a <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>% | |
| summarise(sum(v1)) %>% as_tibble())[3] | |
| q1b <- system.time(DF %>% lazy_dt() %>% group_by(id1) %>% | |
| summarise(sum(v1)) %>% as_tibble())[3] | |
| q2a <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>% | |
| summarise(sum(v1)) %>% as_tibble())[3] | |
| q2b <- system.time(DF %>% lazy_dt() %>% group_by(id1, id2) %>% | |
| summarise(sum(v1)) %>% as_tibble())[3] | |
| q3a <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>% summarise(sum(v1), mean(v3)) %>% as_tibble())[3] | |
| q3b <- system.time(DF %>% lazy_dt() %>% group_by(id3) %>% | |
| summarise(sum(v1), mean(v3)) %>% as_tibble())[3] | |
| q4a <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>% | |
| summarise_at(vars(v1:v3), mean) %>% as_tibble())[3] | |
| q4b <- system.time(DF %>% lazy_dt() %>% group_by(id4) %>% | |
| summarise_at(vars(v1:v3), mean) %>% as_tibble())[3] | |
| q5a <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>% | |
| summarise_at(vars(v1:v3), sum) %>% as_tibble())[3] | |
| q5b <- system.time(DF %>% lazy_dt() %>% group_by(id6) %>% | |
| summarise_at(vars(v1:v3), sum) %>% as_tibble())[3] | |
| dt_dtplyr_results <- list( | |
| q1a = q1a, q1b = q1b, | |
| q2a = q2a, q2b = q2b, | |
| q3a = q3a, q3b = q3b, | |
| q4a = q4a, q4b = q4b, | |
| q5a = q5a, q5b = q5b | |
| ) | |
| DF <- data.frame( | |
| stringsAsFactors = FALSE, | |
| id1 = sample(sprintf("id%03d", 1:K), N, TRUE), | |
| id2 = sample(sprintf("id%03d", 1:K), N, TRUE), | |
| id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE), | |
| id4 = sample(K, N, TRUE), | |
| id5 = sample(K, N, TRUE), | |
| id6 = sample(N / K, N, TRUE), | |
| v1 = sample(5, N, TRUE), | |
| v2 = sample(5, N, TRUE), | |
| v3 = sample(round(runif(100, max = 100), 4), N, TRUE) | |
| ) | |
| q1a <- system.time(DF %>% group_by(id1) %>% summarise(sum(v1)) %>% as_tibble())[3] | |
| q1b <- system.time(DF %>% group_by(id1) %>% summarise(sum(v1)) %>% as_tibble())[3] | |
| q2a <- system.time(DF %>% group_by(id1, id2) %>% summarise(sum(v1)) %>% as_tibble())[3] | |
| q2b <- system.time(DF %>% group_by(id1, id2) %>% summarise(sum(v1)) %>% as_tibble())[3] | |
| q3a <- system.time(DF %>% group_by(id3) %>% summarise(sum(v1), mean(v3)) %>% as_tibble())[3] | |
| q3b <- system.time(DF %>% group_by(id3) %>% | |
| summarise(sum(v1), mean(v3)) %>% as_tibble())[3] | |
| q4a <- system.time(DF %>% group_by(id4) %>% | |
| summarise_at(vars(v1:v3), mean) %>% as_tibble())[3] | |
| q4b <- system.time(DF %>% group_by(id4) %>% | |
| summarise_at(vars(v1:v3), mean) %>% as_tibble())[3] | |
| q5a <- system.time(DF %>% group_by(id6) %>% | |
| summarise_at(vars(v1:v3), sum) %>% as_tibble())[3] | |
| q5b <- system.time(DF %>% group_by(id6) %>% | |
| summarise_at(vars(v1:v3), sum) %>% as_tibble())[3] | |
| dplyr_results <- list( | |
| q1a = q1a, q1b = q1b, | |
| q2a = q2a, q2b = q2b, | |
| q3a = q3a, q3b = q3b, | |
| q4a = q4a, q4b = q4b, | |
| q5a = q5a, q5b = q5b | |
| ) | |
| results <- data.frame( | |
| package = rep(c("data.table", "dtplyr", "dt_dtplyr", "dplyr"), each = 10), | |
| query = rep(rep(c( | |
| "Test 1: 100 ad hoc groups of 1,000,000 rows; result 100 X 2", | |
| "Test 2: 10,000 ad hoc groups of 10,000 rows; result 10,000 X 3", | |
| "Test 3: 1,000,000 ad hoc groups of 100 rows; result 1,000,000 X 3", | |
| "Test 4: 100 ad hoc groups of 1,000,000 rows; result 100 X 4", | |
| "Test 5: 1,000,000 ad hoc groups of 100 rows; result 1,000,000 X 4" | |
| ), | |
| each = 2 | |
| ), 4), | |
| run = rep(rep(1:2, 5), 4), | |
| time = c( | |
| unlist(data_table_results), unlist(dtplyr_results), | |
| unlist(dt_dtplyr_results), unlist(dplyr_results) | |
| ) | |
| ) | |
| first_run <- results %>% filter(run == 1) | |
| second_run <- results %>% filter(run == 2) | |
| first_run %>% ggplot(aes(time, package, fill = package)) + | |
| geom_col(width = 0.5) + | |
| geom_col(data = second_run, alpha = 0.5) + | |
| facet_wrap(facets = vars(query), strip.position = "top", ncol = 1) + | |
| theme_bw() + xlab("Time elapsed (seconds)") + | |
| theme( | |
| axis.text.y = element_blank(), | |
| strip.text = element_text(size = 6), | |
| text = element_text(size = 20), | |
| axis.title.y = element_blank(), | |
| plot.title = element_text(size = 10), | |
| axis.ticks.y = element_blank(), | |
| panel.grid.major.y = element_blank() | |
| ) + | |
| ggtitle("Input table: 100,000,000 rows X 9 columns (4.9 Gb) - Random Order") + | |
| scale_x_continuous(expand = c(0, 1.05)) | |
| sessionInfo() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment