# Benchmark: number of distinct GROUP values per ID on a 1e8-row table,
# computed with data.table and with dplyr.
# All benchmarks are minimum of three consecutive runs.

library(data.table) ## 1.9.3

# Fixed seed so the sampled data (and therefore the results) are reproducible.
set.seed(1L)
DT <- data.table(
  ID = sample(1e3, 1e8, TRUE),
  GROUP = sample(letters, 1e8, TRUE)
)

## data.table, approach 1: count unique GROUP values within each ID.
system.time(ans1 <- DT[, list(N = length(unique(GROUP))), by = ID])
#   user  system elapsed
#  8.677   1.939  10.864

## data.table, approach 2: drop duplicated rows first (DT only has the ID and
## GROUP columns), then count the remaining rows per ID.
system.time(ans2 <- unique(DT)[, .N, by = ID])
#   user  system elapsed
#  7.054   0.948   8.181

identical(ans1, ans2) # [1] TRUE

library(dplyr) ## latest commit from github

# Turn DT into a data.frame before handing it to dplyr.
setDF(DT)

system.time(DT_g <- DT %>% group_by(ID))
#   user  system elapsed
#  7.688   1.369   9.686

gc() ## needed this for measuring timing correctly. group_by seems to take quite a bit of memory.

## dplyr, approach 1: n_distinct().
system.time(ans3 <- DT_g %>% summarise(N = n_distinct(GROUP)))
#   user  system elapsed
# 16.170   0.050  16.618

## dplyr, approach 2: length(unique(.)).
system.time(ans4 <- DT_g %>% summarise(N = length(unique(GROUP))))
#   user  system elapsed
#  7.108   2.421   9.705

identical(ans3, ans4) # [1] TRUE

# Cross-check data.table against dplyr: ans1 is sorted and converted to a
# data.frame so that both results are compared in the same form.
identical(setDF(setorder(ans1)), as.data.frame(ans3)) # [1] TRUE

# n_distinct() seems slower than length(unique(.)) here.. not sure why..
# especially when ?n_distinct says:
#   This is a faster and more concise equivalent of length(unique(x))