W5

Author

M. Loukinov

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights23)
# Load the flight records and the carrier lookup table from nycflights23.
data("flights")
data("airlines")

# Normalise column names to lower snake_case (lower-case, spaces -> "_").
names(flights) <- gsub(" ", "_", tolower(names(flights)), fixed = TRUE)

# How many of the most delayed arrivals to keep for the plot further down.
n_most_delayed <- 150

# The n_most_delayed flights with the largest arrival delay, worst first.
# arrange() always places NA last, so rows with a missing arr_delay cannot
# end up at the top of the ranking.
delay <- flights |>
  arrange(desc(arr_delay)) |>
  head(n_most_delayed) |>
  mutate(column = "top") # tag only the rows that are kept

# Attach the full airline name and spell out the origin airport codes.
# inner_join() with an explicit key keeps the rows in delay order and returns
# a tibble; `relationship` guards against accidental row duplication if the
# lookup table ever contained a carrier code twice.
delay1 <- delay |>
  inner_join(airlines, by = "carrier", relationship = "many-to-one") |>
  mutate(
    origin = recode(
      origin,
      EWR = "Newark Liberty International Airport",
      LGA = "Laguardia Airport",
      JFK = "John F. Kennedy International Airport"
    )
  )

American Airlines is responsible for 17 of the 20 most delayed arrivals.

# How many of the earliest arrivals to keep.
n_most_early <- 50

# The n_most_early flights with the smallest arrival delay. Sorting arr_delay
# in ascending order puts the flights that beat their schedule by the widest
# margin first.
undelay <- flights |>
  arrange(arr_delay) |>
  head(n_most_early) |>
  mutate(column = "bottom") # tag only the rows that are kept
undelay
# A tibble: 50 × 20
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2023    12    25      838            845        -7     1054           1231
 2  2023    12    26     1956           1959        -3     2208           2344
 3  2023     4    11      907            915        -8     1118           1250
 4  2023     6     7     1547           1548        -1     1823           1955
 5  2023    12    26     1659           1710       -11     1941           2112
 6  2023     4    11     1643           1649        -6     1846           2015
 7  2023    12    26      853            900        -7     1110           1238
 8  2023     7    23     1725           1730        -5     2014           2140
 9  2023    12    25     1511           1515        -4     1707           1833
10  2023    12    25     1949           1954        -5     2218           2344
# ℹ 40 more rows
# ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>, column <chr>

Delta Air Lines accounts for 13 of the 20 flights that arrived furthest ahead of schedule.

Next, compare scheduled departure time with arrival delay to see whether there is a trend.

# Scatter plot of arrival delay against scheduled departure time for the
# flights in delay1, coloured by airline and shaped by origin airport.
# sched_dep_time is an integer clock reading (e.g. 845, 1959), so the x axis
# is not a continuous time scale.
p1 <- ggplot(
  delay1,
  aes(x = sched_dep_time, y = arr_delay, color = name, shape = origin)
) +
  geom_point() +
  labs(
    x = "Scheduled Departure Time",
    y = "Actual Arrival Delay",
    title = "How the scheduled departure time affects arrival times",
    # NOTE(review): the nycflights packages appear to credit the FAA registry
    # for the planes table only; confirm the source of the flights data.
    caption = "Data from FAA Aircraft registry",
    color = "Airline",
    shape = "Origin Airport"
  ) +
  theme_gray() +
  # Shrink the legend keys so the two legends take up less room.
  theme(legend.key.size = unit(0.5, "cm"))

p1