library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
# Load the nycflights data set and list the names of its columns.
data("nycflights")
colnames(nycflights)
## [1] "year" "month" "day" "dep_time" "dep_delay" "arr_time"
## [7] "arr_delay" "carrier" "tailnum" "flight" "origin" "dest"
## [13] "air_time" "distance" "hour" "minute"
# Open the help page for the nycflights data set (interactive use).
?nycflights
## starting httpd help server ... done
# Print the tibble to preview its first rows and its column types.
nycflights
## # A tibble: 32,735 × 16
## year month day dep_time dep_delay arr_time arr_de…¹ carrier tailnum flight
## <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr> <int>
## 1 2013 6 30 940 15 1216 -4 VX N626VA 407
## 2 2013 5 7 1657 -3 2104 10 DL N3760C 329
## 3 2013 12 8 859 -1 1238 11 DL N712TW 422
## 4 2013 5 14 1841 -4 2122 -34 DL N914DL 2391
## 5 2013 7 21 1102 -3 1230 -8 9E N823AY 3652
## 6 2013 1 1 1817 -3 2008 3 AA N3AXAA 353
## 7 2013 12 9 1259 14 1617 22 WN N218WN 1428
## 8 2013 8 13 1920 85 2032 71 B6 N284JB 1407
## 9 2013 9 26 725 -10 1027 -8 AA N3FSAA 2279
## 10 2013 4 30 1323 62 1549 60 EV N12163 4162
## # … with 32,725 more rows, 6 more variables: origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, and abbreviated
## # variable name ¹arr_delay
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
# Exercise 1: A binwidth of 150 is ideal for this analysis because it shows all
# the bars clearly and legibly and allows them to be compared with one another.
# The other binwidths make it very difficult to see all the relevant bars
# because they are obscured.
# Histogram of dep_delay with the default binning (no bins/binwidth given).
nycflights %>%
  ggplot(mapping = aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same dep_delay histogram drawn twice to compare a narrow bin width (15)
# with a wide one (150).
nycflights %>%
  ggplot(mapping = aes(x = dep_delay)) +
  geom_histogram(binwidth = 15)

nycflights %>%
  ggplot(mapping = aes(x = dep_delay)) +
  geom_histogram(binwidth = 150)
# Exercise 2: 68 flights meet these criteria.
# Keep only the flights whose destination is SFO and whose month is 2
# (February).
sfo_feb_flights <- filter(nycflights, dest == "SFO" & month == 2)
# Exercise 3: Most of the flights are centered around an arrival delay of 0,
# with some outliers that are 1-2 hours late; there is even one flight that is
# three hours late. The median arrival delay for EWR (and, coincidentally, the
# mean as well) is about -15 minutes, i.e. roughly 15 minutes early.
# Median and mean of arr_delay for the SFO February flights, one row per
# origin. The "_dd" suffix in the column names is kept as-is even though the
# summarised variable is arr_delay.
sfo_feb_flights %>%
  group_by(origin) %>%
  summarise(
    median_dd = median(arr_delay),
    mean_dd = mean(arr_delay)
  )
## # A tibble: 2 × 3
## origin median_dd mean_dd
## <chr> <dbl> <dbl>
## 1 EWR -15.5 -15.1
## 2 JFK -10.5 -3.08
# Print the filtered subset to check its size and contents.
sfo_feb_flights
## # A tibble: 68 × 16
## year month day dep_time dep_delay arr_time arr_de…¹ carrier tailnum flight
## <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr> <int>
## 1 2013 2 18 1527 57 1903 48 DL N711ZX 1322
## 2 2013 2 3 613 14 1008 38 UA N502UA 691
## 3 2013 2 15 955 -5 1313 -28 DL N717TW 1765
## 4 2013 2 18 1928 15 2239 -6 UA N24212 1214
## 5 2013 2 24 1340 2 1644 -21 UA N76269 1111
## 6 2013 2 25 1415 -10 1737 -13 UA N532UA 394
## 7 2013 2 7 1032 1 1352 -10 B6 N627JB 641
## 8 2013 2 15 1805 20 2122 2 AA N335AA 177
## 9 2013 2 13 1056 -4 1412 -13 UA N532UA 642
## 10 2013 2 8 656 -4 1039 -6 DL N710TW 1865
## # … with 58 more rows, 6 more variables: origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, and abbreviated
## # variable name ¹arr_delay
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
# Histogram of arr_delay for the SFO February flights, using a bin width of 5.
sfo_feb_flights %>%
  ggplot(mapping = aes(x = arr_delay)) +
  geom_histogram(binwidth = 5)
# Exercise 4: VX (Virgin America) has the largest median arrival delay in
# magnitude (-22.5), although the negative sign means its flights typically
# arrive early. However, its interquartile range is on par with Delta (DL) and
# UA.
# Per carrier: median and interquartile range of arr_delay, plus the number of
# flights each summary is based on.
sfo_feb_flights %>%
  group_by(carrier) %>%
  summarise(
    median_dd = median(arr_delay),
    iqr_dd = IQR(arr_delay),
    n_flights = n()
  )
## # A tibble: 5 × 4
## carrier median_dd iqr_dd n_flights
## <chr> <dbl> <dbl> <int>
## 1 AA 5 17.5 10
## 2 B6 -10.5 12.2 6
## 3 DL -15 22 19
## 4 UA -10 22 21
## 5 VX -22.5 21.2 12
# Exercise 5: Suppose you really dislike departure delays and you want to
# schedule your travel in a month that minimizes your potential departure delay
# leaving NYC. One option is to choose the month with the lowest mean departure
# delay. Another option is to choose the month with the lowest median departure
# delay. What are the pros and cons of these two choices?
# Answer: The mean makes use of every value in the data set but is also
# susceptible to outliers and skew. The median gives us no idea of the shape of
# the distribution, but it is insensitive to extreme values, so it is less
# likely to be distorted by them.
# Exercise 6: LGA is the best for on-time flights (on-time departure rate of
# about 0.728).
# Classify every flight by its departure delay: "on time" when dep_delay is
# below 5, "delayed" otherwise. The new column is stored back on nycflights.
nycflights <- mutate(
  nycflights,
  dep_type = if_else(dep_delay < 5, "on time", "delayed")
)

# Proportion of "on time" departures per origin, highest rate first.
nycflights %>%
  group_by(origin) %>%
  summarise(
    ot_dep_rate = sum(dep_type == "on time") / n()
  ) %>%
  arrange(desc(ot_dep_rate))
## # A tibble: 3 × 2
## origin ot_dep_rate
## <chr> <dbl>
## 1 LGA 0.728
## 2 JFK 0.694
## 3 EWR 0.637
# Exercise 7
# Add avg_speed: distance divided by air_time rescaled by 60 (air_time / 60).
# NOTE(review): presumably miles per hour if distance is in miles and air_time
# is in minutes -- confirm the units in ?nycflights.
nycflights <- mutate(nycflights, avg_speed = distance / (air_time / 60))

# Show the table with the new column.
nycflights
## # A tibble: 32,735 × 18
## year month day dep_time dep_delay arr_time arr_de…¹ carrier tailnum flight
## <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr> <int>
## 1 2013 6 30 940 15 1216 -4 VX N626VA 407
## 2 2013 5 7 1657 -3 2104 10 DL N3760C 329
## 3 2013 12 8 859 -1 1238 11 DL N712TW 422
## 4 2013 5 14 1841 -4 2122 -34 DL N914DL 2391
## 5 2013 7 21 1102 -3 1230 -8 9E N823AY 3652
## 6 2013 1 1 1817 -3 2008 3 AA N3AXAA 353
## 7 2013 12 9 1259 14 1617 22 WN N218WN 1428
## 8 2013 8 13 1920 85 2032 71 B6 N284JB 1407
## 9 2013 9 26 725 -10 1027 -8 AA N3FSAA 2279
## 10 2013 4 30 1323 62 1549 60 EV N12163 4162
## # … with 32,725 more rows, 8 more variables: origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, dep_type <chr>,
## # avg_speed <dbl>, and abbreviated variable name ¹arr_delay
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
# Exercise 8: avg_speed and distance are positively correlated.
# Scatter plot of avg_speed (y) against distance (x) for all flights.
nycflights %>%
  ggplot(mapping = aes(x = distance, y = avg_speed)) +
  geom_point()
# Exercise 9: 200 minutes.
# Restrict the data to the three carriers of interest. %in% states the
# set-membership test once instead of chaining == comparisons with |, and
# selects exactly the same rows.
new_nycflights <- nycflights %>%
  filter(carrier %in% c("AA", "DL", "UA"))

# Scatter plot of arr_delay against dep_delay for those carriers, with the
# points colored by origin.
ggplot(
  data = new_nycflights,
  aes(x = dep_delay, y = arr_delay, color = origin)
) +
  geom_point()