A lot of this is from https://r4ds.hadley.nz/
library(naniar)
vis_dat()
- look at all datavis_miss()
- look at missing valueslibrary(naniar)
vis_miss(airquality)
last observation carried forward –> NA values indicate repetition of previous value. Use fill()
treatment <- tribble(
~person, ~treatment, ~response,
"Derrick Whitmore", 1, 7,
NA, 2, 10,
NA, 3, NA,
"Katherine Burke", 1, 4
)
treatment %>%
fill(person)
## # A tibble: 4 × 3
## person treatment response
## <chr> <dbl> <dbl>
## 1 Derrick Whitmore 1 7
## 2 Derrick Whitmore 2 10
## 3 Derrick Whitmore 3 NA
## 4 Katherine Burke 1 4
Fixed values -> Replace missing values with particular value replace_na()
treatment %>%
mutate(response = replace_na(response, 0))
## # A tibble: 4 × 3
## person treatment response
## <chr> <dbl> <dbl>
## 1 Derrick Whitmore 1 7
## 2 <NA> 2 10
## 3 <NA> 3 0
## 4 Katherine Burke 1 4
treatment %>%
mutate(response = coalesce(response, 0))
## # A tibble: 4 × 3
## person treatment response
## <chr> <dbl> <dbl>
## 1 Derrick Whitmore 1 7
## 2 <NA> 2 10
## 3 <NA> 3 0
## 4 Katherine Burke 1 4
Replace non NA with NAs -> Use na_if()
treatment %>%
mutate(response = na_if(response, 4))
## # A tibble: 4 × 3
## person treatment response
## <chr> <dbl> <dbl>
## 1 Derrick Whitmore 1 7
## 2 <NA> 2 10
## 3 <NA> 3 NA
## 4 Katherine Burke 1 NA
implicit missing values –> use complete()
stocks %>%
complete(year, qtr)
or anti_join()
. Rows in X that aren’t in Y.
library(nycflights13)
flights %>%
distinct(faa = dest) %>%
anti_join(airports)
## # A tibble: 4 × 1
## faa
## <chr>
## 1 BQN
## 2 SJU
## 3 STT
## 4 PSE
Prevent dropped implict levels in ggplot with drop = False
ggplot(health, aes(smoker)) +
geom_bar() +
scale_x_discrete(drop = FALSE)
dplyr::group_by(), use .drop = FALSE
simputation package
library(simputation)
impute_lm(df, mod|group)
da1 <- impute_lm(dat, Sepal.Length ~ Sepal.Width + Species)
head(da1,3)
pivot_longer(
)
billboard %>%
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "rank"
)
## # A tibble: 24,092 × 5
## artist track date.entered week rank
## <chr> <chr> <date> <chr> <dbl>
## 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk1 87
## 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk2 82
## 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk3 72
## 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk4 77
## 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk5 87
## 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk6 94
## 7 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk7 99
## 8 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk8 NA
## 9 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk9 NA
## 10 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk10 NA
## # … with 24,082 more rows
billboard %>%
pivot_longer(
cols = starts_with("wk"), # optional
names_prefix = "wk",
values_to = "rank"
)
## # A tibble: 24,092 × 5
## artist track date.entered name rank
## <chr> <chr> <date> <chr> <dbl>
## 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 1 87
## 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 2 82
## 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 3 72
## 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 4 77
## 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 5 87
## 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 6 94
## 7 2 Pac Baby Don't Cry (Keep... 2000-02-26 7 99
## 8 2 Pac Baby Don't Cry (Keep... 2000-02-26 8 NA
## 9 2 Pac Baby Don't Cry (Keep... 2000-02-26 9 NA
## 10 2 Pac Baby Don't Cry (Keep... 2000-02-26 10 NA
## # … with 24,082 more rows
pivot_wider(
)
lag()
lead()
diamonds %>%
group_by(cut) %>%
arrange(carat) %>%
mutate(previous_depth = lag(depth, 1))
## # A tibble: 53,940 × 11
## # Groups: cut [5]
## carat cut color clarity depth table price x y z previous_depth
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 0.2 Premi… E SI2 60.2 62 345 3.79 3.75 2.27 NA
## 2 0.2 Premi… E VS2 59.8 62 367 3.79 3.77 2.26 60.2
## 3 0.2 Premi… E VS2 59 60 367 3.81 3.78 2.24 59.8
## 4 0.2 Premi… E VS2 61.1 59 367 3.81 3.78 2.32 59
## 5 0.2 Premi… E VS2 59.7 62 367 3.84 3.8 2.28 61.1
## 6 0.2 Ideal E VS2 59.7 55 367 3.86 3.84 2.3 NA
## 7 0.2 Premi… F VS2 62.6 59 367 3.73 3.71 2.33 59.7
## 8 0.2 Ideal D VS2 61.5 57 367 3.81 3.77 2.33 59.7
## 9 0.2 Very … E VS2 63.4 59 367 3.74 3.71 2.36 NA
## 10 0.2 Ideal E VS2 62.2 57 367 3.76 3.73 2.33 61.5
## # … with 53,930 more rows
library(slider)
mutate(rolling_mean = slide_mean(column, before = X))
library(slider)
flights %>%
slice(1:10) %>%
select(dep_time, dep_delay, carrier) %>%
group_by(carrier) %>%
arrange(dep_time, .by_group = TRUE) %>%
mutate(x = slide_mean(dep_delay, before = 2))
## # A tibble: 10 × 4
## # Groups: carrier [5]
## dep_time dep_delay carrier x
## <int> <dbl> <chr> <dbl>
## 1 542 2 AA 2
## 2 558 -2 AA 0
## 3 544 -1 B6 -1
## 4 555 -5 B6 -3
## 5 557 -3 B6 -3
## 6 554 -6 DL -6
## 7 557 -3 EV -3
## 8 517 2 UA 2
## 9 533 4 UA 3
## 10 554 -4 UA 0.667
# by group
flights %>%
slice(1:10) %>%
select(dep_time, dep_delay, carrier) %>%
group_by(carrier) %>%
arrange(dep_time, .by_group = TRUE) %>%
group_by(carrier) %>%
nest() %>%
mutate(x = map(data, ~slide_mean(.$dep_delay, before = 2))) %>%
unnest()
## # A tibble: 10 × 4
## # Groups: carrier [5]
## carrier dep_time dep_delay x
## <chr> <int> <dbl> <dbl>
## 1 AA 542 2 2
## 2 AA 558 -2 0
## 3 B6 544 -1 -1
## 4 B6 555 -5 -3
## 5 B6 557 -3 -3
## 6 DL 554 -6 -6
## 7 EV 557 -3 -3
## 8 UA 517 2 2
## 9 UA 533 4 3
## 10 UA 554 -4 0.667
func_name <- function(x, y) {
sum(c(x,y))
}
func_name(2,5)
## [1] 7
get_greeting <- function(current_ti){
current_time <- now()
}
Validating reusable functions -> check important preconditions stopifnot
wt_mean <- function(x, w) {
stopifnot(length(x) == length(w))
sum(w * x) / sum(w)
}
wt_mean(c(4,3,5), c(3,4,4))
## [1] 4
if (this) {
# do that
} else if (that) {
# do something else
} else {
#
}
for (i in 1:5){
x = sum(c(i, i-1))
print(x)
}
## [1] 1
## [1] 3
## [1] 5
## [1] 7
## [1] 9
Bar plots
diamonds %>%
group_by(cut) %>%
summarize(n = n()) %>%
ggplot(aes(x = cut, y = n)) +
geom_bar(stat = "identity")
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut))
Histogram
ggplot(data = diamonds, mapping = aes(x = carat)) +
geom_histogram(binwidth = 0.5)
Two categorical variables: geom_count()
Ask: What are the primary keys in the data frame?
Mutating joins –>
** A left join keeps all observations in x. ** A right join keeps all observations in y. ** A full join keeps all observations in x and y
Filtering joins –>
Percent
diamonds %>%
count(color) %>%
summarize(percent = n/sum(n) * 100)
## # A tibble: 7 × 1
## percent
## <dbl>
## 1 12.6
## 2 18.2
## 3 17.7
## 4 20.9
## 5 15.4
## 6 10.1
## 7 5.21
slice_head(n = 1)
takes the first row from each group.slice_tail(n = 1)
takes the last row in each group.slice_min(x, n = 1)
takes the row with the smallest value of x.slice_max(x, n = 1)
takes the row with the largest value of x.library(lubridate)
today()
time %within% interval(t1, t2)
day()
month(1)
[no quotes., values are integers]year()
iris %>%
as_tibble() %>%
mutate(across(c(Sepal.Length, Sepal.Width), round))
## # A tibble: 150 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5 4 1.4 0.2 setosa
## 2 5 3 1.4 0.2 setosa
## 3 5 3 1.3 0.2 setosa
## 4 5 3 1.5 0.2 setosa
## 5 5 4 1.4 0.2 setosa
## 6 5 4 1.7 0.4 setosa
## 7 5 3 1.4 0.3 setosa
## 8 5 3 1.5 0.2 setosa
## 9 4 3 1.4 0.2 setosa
## 10 5 3 1.5 0.1 setosa
## # … with 140 more rows
a_func <- function(x){
stopifnot(is.data.frame(x))
stopifnot(!any(duplicated(x))) # duplicated
x %>%
verify(nrow(.) > 100) %>% # dataframe properties
verify(has_all_names("Species", "Petal.Width")) %>%
verify(is.factor(Species)) %>% # variable types
assert(within_bounds(0,50), -Species) %>% # variable values
assert(in_set(c("setosa", "versicolor", "virginica")), Species) %>%
verify(sum(is.na(Species)) < 2) %>%
verify(mean(Petal.Width)> 0) %>%
verify(sum(Petal.Width == 0.2) < 100)
}