Data-606-HW-2-re-done.knit

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(openintro)

## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata

# Load the nycflights data set into the workspace and list its column names.
data("nycflights")
colnames(nycflights)

##  [1] "year"      "month"     "day"       "dep_time"  "dep_delay" "arr_time" 
##  [7] "arr_delay" "carrier"   "tailnum"   "flight"    "origin"    "dest"     
## [13] "air_time"  "distance"  "hour"      "minute"

# Open the data set's help page only in an interactive session. In a
# non-interactive render there is nobody to read it, and the lookup merely
# starts the local help server as a side effect.
if (interactive()) {
  print(help("nycflights"))
}

## starting httpd help server ... done

# Auto-print the tibble to preview its first rows and column types.
nycflights

## # A tibble: 32,735 × 16
##     year month   day dep_time dep_delay arr_time arr_de…¹ carrier tailnum flight
##    <int> <int> <int>    <int>     <dbl>    <int>    <dbl> <chr>   <chr>    <int>
##  1  2013     6    30      940        15     1216       -4 VX      N626VA     407
##  2  2013     5     7     1657        -3     2104       10 DL      N3760C     329
##  3  2013    12     8      859        -1     1238       11 DL      N712TW     422
##  4  2013     5    14     1841        -4     2122      -34 DL      N914DL    2391
##  5  2013     7    21     1102        -3     1230       -8 9E      N823AY    3652
##  6  2013     1     1     1817        -3     2008        3 AA      N3AXAA     353
##  7  2013    12     9     1259        14     1617       22 WN      N218WN    1428
##  8  2013     8    13     1920        85     2032       71 B6      N284JB    1407
##  9  2013     9    26      725       -10     1027       -8 AA      N3FSAA    2279
## 10  2013     4    30     1323        62     1549       60 EV      N12163    4162
## # … with 32,725 more rows, 6 more variables: origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, and abbreviated
## #   variable name ¹arr_delay
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names

’’’ #Exercise 1 A binwidth of 150 is ideal for this analysis: it shows every bar clearly and legibly and allows the bars to be compared with one another. The other binwidths make it difficult to see all of the relevant bars because some of them are obscured.

’’’

# Histograms of dep_delay at three bin settings, to compare how the choice
# of bin width changes the visible shape of the distribution.

# Default binning (ggplot2 falls back to 30 bins and says so in a message).
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Narrow bins: binwidth = 15.
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 15)

# Wide bins: binwidth = 150.
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 150)

’’’ Exercise 2 68 flights meet these criteria.

’’’

# Keep only the flights in month 2 (February) whose destination is SFO.
sfo_feb_flights <- filter(nycflights, month == 2 & dest == "SFO")

’’’ Exercise 3 Most of the flights are centered around an arrival delay of 0, with some outliers that are 1-2 hours late and even one case that is about three hours late. The median for EWR (and, coincidentally, the mean as well) is about -15 minutes, i.e. roughly 15 minutes early rather than late.

’’’

# Median and mean of arr_delay for the SFO February flights, one row per
# origin.
sfo_feb_flights %>%
  group_by(origin) %>%
  summarise(
    median_dd = median(arr_delay),
    mean_dd = mean(arr_delay)
  )

## # A tibble: 2 × 3
##   origin median_dd mean_dd
##   <chr>      <dbl>   <dbl>
## 1 EWR        -15.5  -15.1 
## 2 JFK        -10.5   -3.08

# Auto-print the filtered tibble to inspect the SFO February flights.
sfo_feb_flights

## # A tibble: 68 × 16
##     year month   day dep_time dep_delay arr_time arr_de…¹ carrier tailnum flight
##    <int> <int> <int>    <int>     <dbl>    <int>    <dbl> <chr>   <chr>    <int>
##  1  2013     2    18     1527        57     1903       48 DL      N711ZX    1322
##  2  2013     2     3      613        14     1008       38 UA      N502UA     691
##  3  2013     2    15      955        -5     1313      -28 DL      N717TW    1765
##  4  2013     2    18     1928        15     2239       -6 UA      N24212    1214
##  5  2013     2    24     1340         2     1644      -21 UA      N76269    1111
##  6  2013     2    25     1415       -10     1737      -13 UA      N532UA     394
##  7  2013     2     7     1032         1     1352      -10 B6      N627JB     641
##  8  2013     2    15     1805        20     2122        2 AA      N335AA     177
##  9  2013     2    13     1056        -4     1412      -13 UA      N532UA     642
## 10  2013     2     8      656        -4     1039       -6 DL      N710TW    1865
## # … with 58 more rows, 6 more variables: origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, and abbreviated
## #   variable name ¹arr_delay
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names

# Histogram of arr_delay for the SFO February flights, binwidth = 5.
sfo_feb_flights %>%
  ggplot(aes(x = arr_delay)) +
  geom_histogram(binwidth = 5)

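’’’ Exercise 4 VX (Virgin America) has the most extreme median arrival delay (-22.5 minutes, i.e. its flights typically arrive early rather than late). Its interquartile range (21.2) is on par with Delta (DL) and United (UA), which have the most variable arrival delays (IQR of 22 each). ’’’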

# Per carrier: median and interquartile range of arr_delay, plus the number
# of flights behind each summary.
sfo_feb_flights %>%
  group_by(carrier) %>%
  summarise(
    median_dd = median(arr_delay),
    iqr_dd = IQR(arr_delay),
    n_flights = n()
  )

## # A tibble: 5 × 4
##   carrier median_dd iqr_dd n_flights
##   <chr>       <dbl>  <dbl>     <int>
## 1 AA            5     17.5        10
## 2 B6          -10.5   12.2         6
## 3 DL          -15     22          19
## 4 UA          -10     22          21
## 5 VX          -22.5   21.2        12

’’’ Exercise 5 Suppose you really dislike departure delays and you want to schedule your travel in a month that minimizes your potential departure delay leaving NYC. One option is to choose the month with the lowest mean departure delay. Another option is to choose the month with the lowest median departure delay. What are the pros and cons of these two choices? Answer: The mean makes use of every element in the data set, but it is also susceptible to outliers and skew. The median gives us no idea of the shape of the distribution, but it is insensitive to extreme values, so it is less likely to be distorted by them.

’’’

’’’ Exercise 6 LGA has the best on-time departure rate (about 72.8%). ’’’

# Label every flight by departure punctuality: a dep_delay below 5 counts as
# "on time", anything else as "delayed".
nycflights <- mutate(
  nycflights,
  dep_type = if_else(dep_delay < 5, "on time", "delayed")
)

# Share of on-time departures per origin, best rate first.
nycflights %>%
  group_by(origin) %>%
  summarise(ot_dep_rate = sum(dep_type == "on time") / n()) %>%
  arrange(desc(ot_dep_rate))

## # A tibble: 3 × 2
##   origin ot_dep_rate
##   <chr>        <dbl>
## 1 LGA          0.728
## 2 JFK          0.694
## 3 EWR          0.637

’’’ Exercise 7 ’’’

# Add avg_speed: distance divided by air_time / 60.
# NOTE(review): the / 60 presumably converts air_time from minutes to hours,
# which would make this a per-hour speed — confirm units against the data docs.
nycflights <- mutate(nycflights, avg_speed = distance / (air_time / 60))

# Auto-print to confirm the new column was added.
nycflights

## # A tibble: 32,735 × 18
##     year month   day dep_time dep_delay arr_time arr_de…¹ carrier tailnum flight
##    <int> <int> <int>    <int>     <dbl>    <int>    <dbl> <chr>   <chr>    <int>
##  1  2013     6    30      940        15     1216       -4 VX      N626VA     407
##  2  2013     5     7     1657        -3     2104       10 DL      N3760C     329
##  3  2013    12     8      859        -1     1238       11 DL      N712TW     422
##  4  2013     5    14     1841        -4     2122      -34 DL      N914DL    2391
##  5  2013     7    21     1102        -3     1230       -8 9E      N823AY    3652
##  6  2013     1     1     1817        -3     2008        3 AA      N3AXAA     353
##  7  2013    12     9     1259        14     1617       22 WN      N218WN    1428
##  8  2013     8    13     1920        85     2032       71 B6      N284JB    1407
##  9  2013     9    26      725       -10     1027       -8 AA      N3FSAA    2279
## 10  2013     4    30     1323        62     1549       60 EV      N12163    4162
## # … with 32,725 more rows, 8 more variables: origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, dep_type <chr>,
## #   avg_speed <dbl>, and abbreviated variable name ¹arr_delay
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names

’’’ Exercise 8 avg_speed and distance are positively correlated ’’’

# Scatter plot of avg_speed (y) against distance (x) for all flights.
nycflights %>%
  ggplot(aes(x = distance, y = avg_speed)) +
  geom_point()

’’’ Exercise 9 200 minutes ’’’

# Restrict the data to the three carriers of interest. `%in%` expresses the
# set membership directly instead of chaining `==` comparisons with `|`, and
# it selects the same rows.
new_nycflights <- nycflights %>%
  filter(carrier %in% c("AA", "DL", "UA"))

# Scatter plot of arr_delay against dep_delay for those carriers.
# NOTE(review): the points are colored by origin although the data was
# filtered by carrier — confirm that `color = origin` is what is intended.
ggplot(
  data = new_nycflights,
  aes(x = dep_delay, y = arr_delay, color = origin)
) +
  geom_point()