Personal code snippets of @tmasjc

Image by Mads Schmidt Rasmussen from unsplash.com

Minimal Bootstrap Theme by Zachary Betz

# Programming in dplyr - Row Filtering

Oct 28, 2018 #dplyr #tidyeval

#### How do you program the row filtering dynamically when working with dplyr verb?

Say we have a dataframe with 2 columns, x and y.

library(tidyverse)
library(rlang)

# make some sample
set.seed(1234)
df <- 2 %>%
rerun(rnorm(100, 10, 2) %>% round()) %>%
bind_cols()
colnames(df) <- c("x", "y")

# simple plot
p <- df %>%
ggplot(aes(x, y)) +
geom_point(position = position_jitter(width = 1),
alpha = 0.7) +
theme_minimal()
p

Our objective here is to ‘zone’ out the points that fall in the 50th percentile of both x and y.

summary(df)
##        x               y
##  Min.   : 5.00   Min.   : 4.00
##  1st Qu.: 8.00   1st Qu.: 9.00
##  Median : 9.00   Median :10.00
##  Mean   : 9.71   Mean   :10.06
##  3rd Qu.:11.00   3rd Qu.:11.00
##  Max.   :15.00   Max.   :16.00
# filter values within (w) q1, q3
(w <- df %>%
filter(x >= quantile(x, probs = 0.25),
x <= quantile(x, probs = 0.75),
y >= quantile(y, probs = 0.25),
y <= quantile(y, probs = 0.75)))
## # A tibble: 42 x 2
##        x     y
##    <dbl> <dbl>
##  1     8    11
##  2    11     9
##  3    11    10
##  4     9    10
##  5     9    11
##  6     8    10
##  7     9    10
##  8     8     9
##  9     8    10
## 10     8    11
## # … with 32 more rows
# points that fall outside q1, q3
v <- anti_join(df, w, by = c("x", "y"))

# modify previous plot
w %>%
ggplot(aes(x, y)) +
geom_jitter(width = 1, col = "red") +
geom_jitter(data = v, aes(x, y), alpha = 0.7) +
theme_minimal()

The way that we express our filtering conditions is what we aim to simplify in this post.

filter(
data = df,
x >= quantile(x, probs = 0.25),
x <= quantile(x, probs = 0.75),
y >= quantile(y, probs = 0.25),
y <= quantile(y, probs = 0.75)
))

## Expression

We start from forming a simple expression in literal text.

text_expression <- function(var, ops, pct) {
sprintf("%s %s quantile(%s, probs = %g)", var, ops, var, pct)
}
text_expression("x", ">=", .75)
## [1] "x >= quantile(x, probs = 0.75)"
text_expression("y", "<=", .75)
## [1] "y <= quantile(y, probs = 0.75)"

## Rlang

We use rlang to transform text into R expression.

custom_filter <- function(x, filter_expr) {
expr <- parse_expr(filter_expr)
df %>% filter(!!expr)
}
custom_filter(x, text_expression("x", ">=", .75))
## # A tibble: 32 x 2
##        x     y
##    <dbl> <dbl>
##  1    11     9
##  2    12    10
##  3    11     8
##  4    11    10
##  5    12    10
##  6    15    10
##  7    11    12
##  8    11     9
##  9    12    12
## 10    13     9
## # … with 22 more rows

## Multiple Arguments

It takes the following steps,

1. Use purrr::map to form multiple expressions
2. Use ; or \n to join formed expressions
3. Use rlang::parse_exprs to transform texts into R expressions
4. Use !!! (3x bang) to slice expressions
# the following has 3 expressions
parse_exprs("NULL; list()\n foo(bar)")
## [[1]]
## NULL
##
## [[2]]
## list()
##
## [[3]]
## foo(bar)

Let’s continue onto our example.

# form 'head' of our expressions
(ehead <- map(.x = list("x", "y"), .f = paste, list(">=", "<=")))
## [[1]]
## [1] "x >=" "x <="
##
## [[2]]
## [1] "y >=" "y <="
# form 'taik' of our expressions
(etail <- map(.x = list("x", "y"),
.f = ~ sprintf('quantile(%s, probs = %g)', .x, .y),
list = c(.25, .75)))
## [[1]]
## [1] "quantile(x, probs = 0.25)" "quantile(x, probs = 0.75)"
##
## [[2]]
## [1] "quantile(y, probs = 0.25)" "quantile(y, probs = 0.75)"
# combine head and tail
(exprs <- map2(ehead, etail, ~ sprintf("%s %s", .x, .y)))
## [[1]]
## [1] "x >= quantile(x, probs = 0.25)" "x <= quantile(x, probs = 0.75)"
##
## [[2]]
## [1] "y >= quantile(y, probs = 0.25)" "y <= quantile(y, probs = 0.75)"
# collapse our expressions into a long string
(exprs <- exprs %>%
unlist() %>%
# use ';' or "\n"
paste0(collapse = ";"))
## [1] "x >= quantile(x, probs = 0.25);x <= quantile(x, probs = 0.75);y >= quantile(y, probs = 0.25);y <= quantile(y, probs = 0.75)"

## Apply Custom Filtering

# modify our custom filtering to fit multiple expressions
custom_filter = function(x, exprs) {
# take note of the plurar form
exprs_list <- parse_exprs(exprs)
# apply on data frame
x %>% filter(!!!exprs_list)
}
# thats it
(w2 <- df %>% custom_filter(exprs))
## # A tibble: 42 x 2
##        x     y
##    <dbl> <dbl>
##  1     8    11
##  2    11     9
##  3    11    10
##  4     9    10
##  5     9    11
##  6     8    10
##  7     9    10
##  8     8     9
##  9     8    10
## 10     8    11
## # … with 32 more rows
w2 %>%
ggplot(aes(x, y)) +
geom_point(position = position_jitter(width = 1), col = "red") +
geom_point(data = anti_join(df, w), aes(x, y)) +
theme_minimal()

# final check
all(w == w2)
## [1] TRUE