The following are solutions to exercises from R For Data Science <https://r4ds.had.co.nz/transform.html.
# Data wrangling and transformation
#install.packages("nycflights13")
library(nycflights13)
?flights
#View(flights)
head(flights)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
library(tidyverse)
## ── Attaching packages ──────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## ── Conflicts ─────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# VERBS
## filter()
## arrange()
## select()
## mutate()
## summarise()
## FILTER
# flights on New Years
jan1<-flights%>%
filter(month==1,
day==1)
jan1
## # A tibble: 842 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# flights on Christmas
dec25<-flights%>%
filter(month==12,
day==25)
dec25
## # A tibble: 719 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 12 25 456 500 -4 649 651
## 2 2013 12 25 524 515 9 805 814
## 3 2013 12 25 542 540 2 832 850
## 4 2013 12 25 546 550 -4 1022 1027
## 5 2013 12 25 556 600 -4 730 745
## 6 2013 12 25 557 600 -3 743 752
## 7 2013 12 25 557 600 -3 818 831
## 8 2013 12 25 559 600 -1 855 856
## 9 2013 12 25 559 600 -1 849 855
## 10 2013 12 25 600 600 0 850 846
## # … with 709 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# flights in nov OR dec
nov_dec<-flights%>%
filter(month %in% c(11, 12))
# Demorgans Law
# (A or B )^c = A^c and B^c
ABcomp<-flights%>%
filter(!(arr_delay>120| dep_delay>120))
AB<-flights%>%
filter(arr_delay <=120, dep_delay<=120)
head(ABcomp==AB)
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## [1,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [3,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [4,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [5,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [6,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## [1,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [3,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [4,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [5,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [6,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## time_hour
## [1,] TRUE
## [2,] TRUE
## [3,] TRUE
## [4,] TRUE
## [5,] TRUE
## [6,] TRUE
# is.na
x<-NA
y<-NA
x==y
## [1] NA
is.na(x)
## [1] TRUE
# Exercises
# 1. Find all flights that
## 1. Had an arrival delay of two or more hours
ex2.1.1<-flights%>%
filter(arr_delay>=120)
dim(ex2.1.1)
## [1] 10200 19
## 2. Flew to Houston (IAH or HOU)
ex2.1.2<-flights%>%
filter(dest %in% c("IAH", "HOU"))
dim(ex2.1.2)
## [1] 9313 19
# 3. Were operated by United, American, or Delta
ex2.1.3<-flights%>%
filter(carrier %in%c("UA", "AA", "DL"))
dim(ex2.1.3)
## [1] 139504 19
# 4. Departed in summer (July, August, and September)
ex2.1.4<-flights%>%
filter(month %in% c(7, 8, 9))
dim(ex2.1.4)
## [1] 86326 19
# 5. Arrived more than two hours late, but didn't leave late
ex2.1.5<-flights%>%
filter(arr_delay>=120,
dep_delay<=0)
dim(ex2.1.5)
## [1] 29 19
# 6. Were delayed by at least an hour, but made up over 30 minutes
## we'll come back to this
ex2.1.6<-flights%>%
mutate(madeup=dep_delay-arr_delay)%>%
filter(dep_delay>=60,
madeup>=30)
dim(ex2.1.6)
## [1] 2074 20
# 7. Departed between midnight and 6am (inclusive)
ex2.1.7<-flights%>%
filter(dep_time>=0 , dep_time<=600)
dim(ex2.1.7)
## [1] 9344 19
# 3. How many flights have a missing dep_time
ex2.3<-flights%>%
filter(is.na(dep_time)==TRUE)
head(ex2.3)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 NA 1630 NA NA 1815
## 2 2013 1 1 NA 1935 NA NA 2240
## 3 2013 1 1 NA 1500 NA NA 1825
## 4 2013 1 1 NA 600 NA NA 901
## 5 2013 1 2 NA 1540 NA NA 1747
## 6 2013 1 2 NA 1620 NA NA 1746
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>