Working with data basics in R

A lot of this is from https://r4ds.hadley.nz/

Missing values

Visualization

library(naniar)

vis_dat() - look at all data
vis_miss() - look at missing values

library(naniar)
vis_miss(airquality)

Tidying

last observation carried forward –> NA values indicate repetition of previous value. Use fill()

treatment <- tribble(
  ~person,           ~treatment, ~response,
  "Derrick Whitmore", 1,         7,
  NA,                 2,         10,
  NA,                 3,         NA,
  "Katherine Burke",  1,         4
)

treatment %>%
  fill(person)

## # A tibble: 4 × 3
##   person           treatment response
##   <chr>                <dbl>    <dbl>
## 1 Derrick Whitmore         1        7
## 2 Derrick Whitmore         2       10
## 3 Derrick Whitmore         3       NA
## 4 Katherine Burke          1        4

Fixed values -> Replace missing values with particular value replace_na()

treatment %>%
  mutate(response = replace_na(response, 0))

## # A tibble: 4 × 3
##   person           treatment response
##   <chr>                <dbl>    <dbl>
## 1 Derrick Whitmore         1        7
## 2 <NA>                     2       10
## 3 <NA>                     3        0
## 4 Katherine Burke          1        4

treatment %>%
  mutate(response = coalesce(response, 0))

## # A tibble: 4 × 3
##   person           treatment response
##   <chr>                <dbl>    <dbl>
## 1 Derrick Whitmore         1        7
## 2 <NA>                     2       10
## 3 <NA>                     3        0
## 4 Katherine Burke          1        4

Replace non NA with NAs -> Use na_if()

treatment %>%
  mutate(response = na_if(response, 4))

## # A tibble: 4 × 3
##   person           treatment response
##   <chr>                <dbl>    <dbl>
## 1 Derrick Whitmore         1        7
## 2 <NA>                     2       10
## 3 <NA>                     3       NA
## 4 Katherine Burke          1       NA

implicit missing values –> use complete()

stocks %>%
 complete(year, qtr)

or anti_join(). Rows in X that aren’t in Y.

library(nycflights13)

flights %>% 
  distinct(faa = dest) %>% 
  anti_join(airports)

## # A tibble: 4 × 1
##   faa  
##   <chr>
## 1 BQN  
## 2 SJU  
## 3 STT  
## 4 PSE

Prevent dropped implict levels in ggplot with drop = False

ggplot(health, aes(smoker)) +
  geom_bar() +
  scale_x_discrete(drop = FALSE)

dplyr::group_by(), use .drop = FALSE

Modeling missing data

simputation package

library(simputation)
impute_lm(df, mod|group)

da1 <- impute_lm(dat, Sepal.Length ~ Sepal.Width + Species)
head(da1,3)

Data Transformations

to long form

pivot_longer(

cols = i.e. which columns to pivot longer,
names_to = names of the variable stored in the column names (make this up)
values_to = names the variable stored in the cell values (make this up)

)

billboard %>%
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    values_to = "rank"
  )

## # A tibble: 24,092 × 5
##    artist track                   date.entered week   rank
##    <chr>  <chr>                   <date>       <chr> <dbl>
##  1 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk1      87
##  2 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk2      82
##  3 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk3      72
##  4 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk4      77
##  5 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk5      87
##  6 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk6      94
##  7 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk7      99
##  8 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk8      NA
##  9 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk9      NA
## 10 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk10     NA
## # … with 24,082 more rows

billboard %>%
  pivot_longer(
    cols = starts_with("wk"), # optional
    names_prefix = "wk", 
    values_to = "rank"
  )

## # A tibble: 24,092 × 5
##    artist track                   date.entered name   rank
##    <chr>  <chr>                   <date>       <chr> <dbl>
##  1 2 Pac  Baby Don't Cry (Keep... 2000-02-26   1        87
##  2 2 Pac  Baby Don't Cry (Keep... 2000-02-26   2        82
##  3 2 Pac  Baby Don't Cry (Keep... 2000-02-26   3        72
##  4 2 Pac  Baby Don't Cry (Keep... 2000-02-26   4        77
##  5 2 Pac  Baby Don't Cry (Keep... 2000-02-26   5        87
##  6 2 Pac  Baby Don't Cry (Keep... 2000-02-26   6        94
##  7 2 Pac  Baby Don't Cry (Keep... 2000-02-26   7        99
##  8 2 Pac  Baby Don't Cry (Keep... 2000-02-26   8        NA
##  9 2 Pac  Baby Don't Cry (Keep... 2000-02-26   9        NA
## 10 2 Pac  Baby Don't Cry (Keep... 2000-02-26   10       NA
## # … with 24,082 more rows

to wide form

pivot_wider(

id_cols = which columns identify rows,
names_from = column variable name,
values_from = column variable name

)

Leading/Lagging values

simple lag/lead

lag() lead()

diamonds %>%
  group_by(cut) %>%
  arrange(carat) %>%
  mutate(previous_depth = lag(depth, 1))

## # A tibble: 53,940 × 11
## # Groups:   cut [5]
##    carat cut    color clarity depth table price     x     y     z previous_depth
##    <dbl> <ord>  <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>          <dbl>
##  1   0.2 Premi… E     SI2      60.2    62   345  3.79  3.75  2.27           NA  
##  2   0.2 Premi… E     VS2      59.8    62   367  3.79  3.77  2.26           60.2
##  3   0.2 Premi… E     VS2      59      60   367  3.81  3.78  2.24           59.8
##  4   0.2 Premi… E     VS2      61.1    59   367  3.81  3.78  2.32           59  
##  5   0.2 Premi… E     VS2      59.7    62   367  3.84  3.8   2.28           61.1
##  6   0.2 Ideal  E     VS2      59.7    55   367  3.86  3.84  2.3            NA  
##  7   0.2 Premi… F     VS2      62.6    59   367  3.73  3.71  2.33           59.7
##  8   0.2 Ideal  D     VS2      61.5    57   367  3.81  3.77  2.33           59.7
##  9   0.2 Very … E     VS2      63.4    59   367  3.74  3.71  2.36           NA  
## 10   0.2 Ideal  E     VS2      62.2    57   367  3.76  3.73  2.33           61.5
## # … with 53,930 more rows

Rolling average

library(slider)
mutate(rolling_mean = slide_mean(column, before = X))

library(slider)

flights %>%
  slice(1:10) %>%
  select(dep_time, dep_delay, carrier) %>%
  group_by(carrier) %>%
  arrange(dep_time, .by_group = TRUE) %>%
  mutate(x = slide_mean(dep_delay, before = 2))

## # A tibble: 10 × 4
## # Groups:   carrier [5]
##    dep_time dep_delay carrier      x
##       <int>     <dbl> <chr>    <dbl>
##  1      542         2 AA       2    
##  2      558        -2 AA       0    
##  3      544        -1 B6      -1    
##  4      555        -5 B6      -3    
##  5      557        -3 B6      -3    
##  6      554        -6 DL      -6    
##  7      557        -3 EV      -3    
##  8      517         2 UA       2    
##  9      533         4 UA       3    
## 10      554        -4 UA       0.667

# by group
flights %>%
  slice(1:10) %>%
  select(dep_time, dep_delay, carrier) %>%
  group_by(carrier) %>%
  arrange(dep_time, .by_group = TRUE) %>%
  group_by(carrier) %>%
  nest() %>%
  mutate(x = map(data, ~slide_mean(.$dep_delay, before = 2))) %>%
  unnest()

## # A tibble: 10 × 4
## # Groups:   carrier [5]
##    carrier dep_time dep_delay      x
##    <chr>      <int>     <dbl>  <dbl>
##  1 AA           542         2  2    
##  2 AA           558        -2  0    
##  3 B6           544        -1 -1    
##  4 B6           555        -5 -3    
##  5 B6           557        -3 -3    
##  6 DL           554        -6 -6    
##  7 EV           557        -3 -3    
##  8 UA           517         2  2    
##  9 UA           533         4  3    
## 10 UA           554        -4  0.667

Control Structure

Function

func_name <- function(x, y) {
  sum(c(x,y))
}

func_name(2,5)

## [1] 7

get_greeting <- function(current_ti){
  current_time <- now()
}

Validating reusable functions -> check important preconditions stopifnot

wt_mean <- function(x, w) {
  stopifnot(length(x) == length(w))
  sum(w * x) / sum(w)
}

wt_mean(c(4,3,5), c(3,4,4))

## [1] 4

if/else

if (this) {
  # do that
} else if (that) {
  # do something else
} else {
  # 
}

use || and && in if statements
for vectorized, use any() or all()

for loop

for (i in 1:5){
  x = sum(c(i, i-1))
  print(x)
}

## [1] 1
## [1] 3
## [1] 5
## [1] 7
## [1] 9

alternatively: map(list, func)

Plotting

Bar plots

diamonds %>%
  group_by(cut) %>%
  summarize(n = n()) %>%
  ggplot(aes(x = cut, y = n)) +
  geom_bar(stat = "identity")

ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut))

Histogram

ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.5)

Two categorical variables: geom_count()

Joins

Ask: What are the primary keys in the data frame?

Mutating joins –>

Inner join - all observations in both
Outer join - observations in at least one

** A left join keeps all observations in x. ** A right join keeps all observations in y. ** A full join keeps all observations in x and y

Filtering joins –>

semi_join(x, y) keeps all observations in x that have a match in y.
anti_join(x, y) drops all observations in x that have a match in y.

Aggregation

Percent

diamonds %>%
  count(color) %>%
  summarize(percent = n/sum(n) * 100)

## # A tibble: 7 × 1
##   percent
##     <dbl>
## 1   12.6 
## 2   18.2 
## 3   17.7 
## 4   20.9 
## 5   15.4 
## 6   10.1 
## 7    5.21

Misc Cleaning

Slice

slice_head(n = 1) takes the first row from each group.
slice_tail(n = 1) takes the last row in each group.
slice_min(x, n = 1) takes the row with the smallest value of x.
slice_max(x, n = 1) takes the row with the largest value of x.

Dates

library(lubridate)

today -> today()
time %within% interval(t1, t2)
day()
month(1) [no quotes., values are integers]
year()

Misc

a:b is an abbreviation for seq(a, b, 1)
arrange(.data, …, .by_group = FALSE)
across:

 iris %>%
  as_tibble() %>%
  mutate(across(c(Sepal.Length, Sepal.Width), round))

## # A tibble: 150 × 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1            5           4          1.4         0.2 setosa 
##  2            5           3          1.4         0.2 setosa 
##  3            5           3          1.3         0.2 setosa 
##  4            5           3          1.5         0.2 setosa 
##  5            5           4          1.4         0.2 setosa 
##  6            5           4          1.7         0.4 setosa 
##  7            5           3          1.4         0.3 setosa 
##  8            5           3          1.5         0.2 setosa 
##  9            4           3          1.4         0.2 setosa 
## 10            5           3          1.5         0.1 setosa 
## # … with 140 more rows

a_func <- function(x){
  
  stopifnot(is.data.frame(x))
  stopifnot(!any(duplicated(x)))             # duplicated

  x %>%
    verify(nrow(.) > 100) %>%                 # dataframe properties
    verify(has_all_names("Species", "Petal.Width")) %>%
    verify(is.factor(Species)) %>%            # variable types
    assert(within_bounds(0,50), -Species) %>% # variable values
    assert(in_set(c("setosa", "versicolor", "virginica")), Species) %>%
    verify(sum(is.na(Species)) < 2) %>%
    verify(mean(Petal.Width)> 0) %>% 
    verify(sum(Petal.Width ==  0.2) < 100) 
  
}