Data Transformations with Dplyr Solutions

The following are solutions to exercises from R For Data Science <https://r4ds.had.co.nz/transform.html.

Prerequisites

# Data wrangling and transformation
#install.packages("nycflights13")
library(nycflights13)
?flights
#View(flights)

head(flights)
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
library(tidyverse)
## ── Attaching packages ──────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## ── Conflicts ─────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Practice with Dplyr

# VERBS
## filter()
## arrange()
## select()
## mutate()
## summarise()

## FILTER
# flights on New Years
jan1<-flights%>%
  filter(month==1,
         day==1)
jan1
## # A tibble: 842 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# flights on Christmas
dec25<-flights%>%
  filter(month==12,
         day==25)
dec25
## # A tibble: 719 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12    25      456            500        -4      649            651
##  2  2013    12    25      524            515         9      805            814
##  3  2013    12    25      542            540         2      832            850
##  4  2013    12    25      546            550        -4     1022           1027
##  5  2013    12    25      556            600        -4      730            745
##  6  2013    12    25      557            600        -3      743            752
##  7  2013    12    25      557            600        -3      818            831
##  8  2013    12    25      559            600        -1      855            856
##  9  2013    12    25      559            600        -1      849            855
## 10  2013    12    25      600            600         0      850            846
## # … with 709 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# flights in nov OR dec
nov_dec<-flights%>%
  filter(month %in% c(11, 12))

# Demorgans Law
# (A or B )^c = A^c and B^c

ABcomp<-flights%>%
  filter(!(arr_delay>120| dep_delay>120))

AB<-flights%>%
  filter(arr_delay <=120, dep_delay<=120)

head(ABcomp==AB)
##      year month  day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## [1,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [2,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [3,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [4,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [5,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [6,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
##      arr_delay carrier flight tailnum origin dest air_time distance hour minute
## [1,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [2,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [3,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [4,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [5,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [6,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
##      time_hour
## [1,]      TRUE
## [2,]      TRUE
## [3,]      TRUE
## [4,]      TRUE
## [5,]      TRUE
## [6,]      TRUE
# is.na
x<-NA
y<-NA

x==y
## [1] NA
is.na(x)
## [1] TRUE

SOLUTIONS TO 5.2.4 Exercises

1.1 Find all flights that had an arrival delay of two or more hours

# Exercises
# 1. Find all flights that 
## 1. Had an arrival delay of two or more hours

ex2.1.1<-flights%>%
  filter(arr_delay>=120)
dim(ex2.1.1)
## [1] 10200    19

1.2 Find all flights that flew to Houston (IAH or HOU)

## 2. Flew to Houston (IAH or HOU)

ex2.1.2<-flights%>%
  filter(dest %in% c("IAH", "HOU"))
dim(ex2.1.2)
## [1] 9313   19

1.3 Find all flights that were operated by United (UA), American (AA), or Delta (DL)

# 3. Were operated by United, American, or Delta

ex2.1.3<-flights%>%
  filter(carrier %in%c("UA", "AA", "DL"))
dim(ex2.1.3)
## [1] 139504     19

1.4 Find all flights that departed in summer (July, August, and September)

# 4. Departed in summer (July, August, and September)

ex2.1.4<-flights%>%
  filter(month %in% c(7, 8, 9))
dim(ex2.1.4)
## [1] 86326    19

1.5 Find all flights that arrived more than two hours late, but didn’t leave late

# 5. Arrived more than two hours late, but didn't leave late

ex2.1.5<-flights%>%
  filter(arr_delay>=120, 
         dep_delay<=0)
dim(ex2.1.5)
## [1] 29 19

1.6 Find all flights that were delayed by at least an hour, but made up over 30 minutes in flight

# 6. Were delayed by at least an hour, but made up over 30 minutes 
## we'll come back to this 

ex2.1.6<-flights%>%
  mutate(madeup=dep_delay-arr_delay)%>%
  filter(dep_delay>=60, 
         madeup>=30)
dim(ex2.1.6)
## [1] 2074   20

1.7 Find all flights that departed between midnight and 6am (inclusive)

# 7. Departed between midnight and 6am (inclusive)

ex2.1.7<-flights%>%
  filter(dep_time>=0 , dep_time<=600)
dim(ex2.1.7)
## [1] 9344   19

3. How many flights have a missing dep_time? What other variables are missing? What might these rows represent?

# 3. How many flights have a missing dep_time
ex2.3<-flights%>%
  filter(is.na(dep_time)==TRUE)
head(ex2.3)
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1       NA           1630        NA       NA           1815
## 2  2013     1     1       NA           1935        NA       NA           2240
## 3  2013     1     1       NA           1500        NA       NA           1825
## 4  2013     1     1       NA            600        NA       NA            901
## 5  2013     1     2       NA           1540        NA       NA           1747
## 6  2013     1     2       NA           1620        NA       NA           1746
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>