Personal code snippets of @tmasjc

Site powered by Hugo

Image by Mads Schmidt Rasmussen from unsplash.com

Minimal Bootstrap Theme by Zachary Betz

Data.Table

Feb 9, 2018 #data.table

A quick walkthrough on using data.table in performing long-to-wide operation on data frame. Significant improvement on speed.

library(dplyr)
library(tidyr)
library(data.table)

# set
options(DT.options = list(pageLength = 5, dom = 'tip'))
set.seed(123)
rand <- runif(1e7, min = 1, max = length(LETTERS)) %>% round()

# Make some dummy data
df <- data.frame(
    letters = LETTERS[rand] %>% sample(),
    num = (1:length(LETTERS))[rand] %>% sample()
)
# Add count for spreading later
df <- df %>% group_by(letters) %>% mutate(count = 1:length(letters)) %>% arrange(letters, count)
head(df, 10)
## # A tibble: 10 x 3
## # Groups:   letters [1]
##    letters   num count
##    <fct>   <int> <int>
##  1 A          20     1
##  2 A          17     2
##  3 A          10     3
##  4 A           1     4
##  5 A          23     5
##  6 A          10     6
##  7 A           5     7
##  8 A           4     8
##  9 A          19     9
## 10 A           6    10

Let’s try out different methods on converting above long data frame to wide format. Our objective is to group them by letters and each row lists how many times the particular letter is called, from 1 to N.

# For example
(df %>% spread(count, num) %>% head())[1:5, 1:10]
## # A tibble: 5 x 10
## # Groups:   letters [26]
##   letters   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`
##   <fct>   <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 A          20    17    10     1    23    10     5     4    19
## 2 B           4     5     6    12    11     3     6    24    20
## 3 C           2    20    10     4    14    12     5     8    10
## 4 D           3    21    15    24    25    14    17    13    25
## 5 E          23    26     6    14    20     5    21    10    11

Method 1: The Usual Spread

t1 <- system.time(m1 <- df %>% spread(count, num))
(m1 %>% head())[1:5, 1:10]
## # A tibble: 5 x 10
## # Groups:   letters [26]
##   letters   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`
##   <fct>   <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 A          20    17    10     1    23    10     5     4    19
## 2 B           4     5     6    12    11     3     6    24    20
## 3 C           2    20    10     4    14    12     5     8    10
## 4 D           3    21    15    24    25    14    17    13    25
## 5 E          23    26     6    14    20     5    21    10    11

Method 2: D-Cast

t2 <- system.time(m2 <- df %>% dcast(letters ~ count, value.var = "num"))
## Warning in dcast(., letters ~ count, value.var = "num"): The dcast generic
## in data.table has been passed a grouped_df and will attempt to redirect
## to the reshape2::dcast; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. Please do this redirection yourself like
## reshape2::dcast(.). In the next version, this warning will become an error.
(m2 %>% head())[1:5, 1:10]
##   letters  1  2  3  4  5  6  7  8  9
## 1       A 20 17 10  1 23 10  5  4 19
## 2       B  4  5  6 12 11  3  6 24 20
## 3       C  2 20 10  4 14 12  5  8 10
## 4       D  3 21 15 24 25 14 17 13 25
## 5       E 23 26  6 14 20  5 21 10 11

Method 3: D-Cast with Data Table

# Convert to data table
dt <- df %>% as.data.table()
t3 <- system.time(m3 <- dt %>% dcast(letters ~ count, value.var = "num"))
(m3 %>% head())[1:5, 1:10]
##    letters  1  2  3  4  5  6  7  8  9
## 1:       A 20 17 10  1 23 10  5  4 19
## 2:       B  4  5  6 12 11  3  6 24 20
## 3:       C  2 20 10  4 14 12  5  8 10
## 4:       D  3 21 15 24 25 14 17 13 25
## 5:       E 23 26  6 14 20  5 21 10 11

Result

Shown below.

res <- list(t1, t2, t3)
names(res) <- c("Spread", "DCast", "DCast w/ Data Table")
res
## $Spread
##    user  system elapsed 
##  10.273   1.036  11.678 
## 
## $DCast
##    user  system elapsed 
##   7.415   0.996   9.336 
## 
## $`DCast w/ Data Table`
##    user  system elapsed 
##   5.034   0.556   4.058