NYC Flights Homework

Author

Jonathan RH

Packages

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights23)
library(RColorBrewer)

Data

# Load the `flights` table that ships with the nycflights23 package.
data("flights", package = "nycflights23")

# Preview the first six rows to check the columns and their types.
flights |> head()
# A tibble: 6 × 19
   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1  2023     1     1        1           2038       203      328              3
2  2023     1     1       18           2300        78      228            135
3  2023     1     1       31           2344        47      500            426
4  2023     1     1       33           2140       173      238           2352
5  2023     1     1       36           2048       228      223           2252
6  2023     1     1      503            500         3      808            815
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Decoding Carriers

# Replace the two-letter codes in `flights$carrier` with readable airline
# names. A single named lookup vector stands in for fourteen near-identical
# subset assignments, so each label is edited in exactly one place.
# NOTE(review): "YX" is labelled "Midwest Airlines", but for 2023 data that
# code appears to belong to Republic Airline -- confirm against
# nycflights23::airlines before changing it, since the label shows up in the
# tables and the plot built from `flights`.
carrier_names <- c(
  "9E" = "Endeavor Air",
  "AA" = "American Airlines",
  "AS" = "Alaska Airlines",
  "B6" = "JetBlue Airways",
  "DL" = "Delta Air Lines",
  "F9" = "Frontier Airlines",
  "G4" = "Allegiant Air",
  "HA" = "Hawaiian Airlines",
  "MQ" = "Envoy Air",
  "NK" = "Spirit Airlines",
  "OO" = "SkyWest Airlines",
  "UA" = "United Airlines",
  "WN" = "Southwest Airlines",
  "YX" = "Midwest Airlines"
)

# Look up every row at once. A code that is not in the table comes back as
# NA from the lookup, in which case the original value is kept untouched.
decoded_carrier <- unname(carrier_names[flights$carrier])
flights$carrier <- ifelse(
  is.na(decoded_carrier),
  flights$carrier,
  decoded_carrier
)

Count

# Tally the number of flights recorded for each carrier name.
table(flights[["carrier"]])

   Alaska Airlines      Allegiant Air  American Airlines    Delta Air Lines 
              7843                671              40525              61562 
      Endeavor Air          Envoy Air  Frontier Airlines  Hawaiian Airlines 
             54141                357               1286                366 
   JetBlue Airways   Midwest Airlines   SkyWest Airlines Southwest Airlines 
             66169              88785               6432              12385 
   Spirit Airlines    United Airlines 
             15189              79641 

Changing the Data

# Carriers left out of the analysis; removing these four leaves ten carriers.
excluded_carriers <- c(
  "Hawaiian Airlines",
  "Envoy Air",
  "Allegiant Air",
  "Frontier Airlines"
)

# Build the early-departure data set:
#   * drop the `year` column;
#   * keep flights that left ahead of schedule (negative `dep_delay`) --
#     filter() also drops rows whose `dep_delay` is NA, because it keeps only
#     rows where the condition is TRUE;
#   * remove the excluded carriers;
#   * flip the sign so `dep_delay` holds positive "minutes early".
Altered_Flights <- flights |>
  select(-year) |>
  filter(dep_delay < 0, !carrier %in% excluded_carriers) |>
  mutate(dep_delay = abs(dep_delay))

head(Altered_Flights)
# A tibble: 6 × 18
  month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
  <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1     1     1      524            530         6      645            710
2     1     1      549            559        10      905            901
3     1     1      551            600         9      846            859
4     1     1      552            559         7      857            911
5     1     1      554            600         6      914            920
6     1     1      554            600         6      725            735
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

New Count

# Count the remaining early-departure flights per carrier, store the table,
# and print it.
Carrier_Count <- Altered_Flights |>
  pull(carrier) |>
  table()
Carrier_Count

   Alaska Airlines  American Airlines    Delta Air Lines       Endeavor Air 
              4673              25590              32995              37515 
   JetBlue Airways   Midwest Airlines   SkyWest Airlines Southwest Airlines 
             31333              65327               3751               3993 
   Spirit Airlines    United Airlines 
              8191              40351 

Summary

# Total the early-departure minutes for each carrier, then turn `carrier`
# into a factor whose levels run from the largest total to the smallest.
# ggplot2 draws discrete axes in level order, so this controls bar order.
Altered_Flights_Summary <- Altered_Flights |>
  group_by(carrier) |>
  summarise(total_dep_delay = sum(dep_delay)) |>
  mutate(carrier = fct_reorder(carrier, total_dep_delay, .desc = TRUE))

# Rows stay in alphabetical carrier order; only the factor levels are sorted.
head(Altered_Flights_Summary)
# A tibble: 6 × 2
  carrier           total_dep_delay
  <fct>                       <dbl>
1 Alaska Airlines             31997
2 American Airlines          140149
3 Delta Air Lines            143104
4 Endeavor Air               219138
5 JetBlue Airways            186239
6 Midwest Airlines           419241

Data Visualization

# Bar chart of accumulated early-departure minutes per carrier. Bar height
# and fill colour both encode `total_dep_delay`.
# NOTE(review): the caption credits the FAA aircraft registry -- confirm that
# this is the right source to cite for the flight delay figures.
P1 <- Altered_Flights_Summary |>
  ggplot(aes(x = carrier, y = total_dep_delay, fill = total_dep_delay)) +
  geom_col() +
  # `fill` is numeric, so a continuous gradient is used; scale_fill_brewer()
  # did not work here (it is meant for discrete fills).
  scale_fill_gradient(low = "cyan", high = "darkmagenta") +
  labs(
    x = "Airline Carriers",
    y = "Early Departure Minutes Accumulated",
    title = "Top 10 Carriers Who Left the Earliest in 2023",
    fill = "Minutes",
    caption = "Source: FAA Aircraft registry"
  ) +
  # Rotate the carrier names so the long labels do not overlap.
  theme(axis.text.x = element_text(angle = 90))

Converting From Scientific Notation to Standard Notation

# Penalise scientific notation heavily so large numbers print in full
# (e.g. 400000 rather than 4e+05). This is a session-wide setting and
# applies to everything printed or plotted after this line.
options(scipen = 999)

Bargraph

# Draw the finished bar chart.
print(P1)

Essay

I decided to have fun with the NYC Flights data set and made a bar graph that represents the top 10 carriers that left the earliest in 2023. First, I changed the carrier codes to their proper names. Then I filtered the data so that it only includes the flights that had early departures, and I removed the four carriers with the fewest flights. After cleaning the data, I wanted to sort the carriers by their accumulated early departure minutes, from greatest to least. This was the most difficult part for me, but with the help of notes (from DATA 101 & 110), GeeksforGeeks, and Stack Overflow, I was able to figure it out. I had to convert the negative numbers to positive by using the abs() function inside mutate(). Then, in the summary, I used summarize(), mutate(), and factor() to order the minutes from greatest to least. Finally, I put the summary into the bar graph, added the titles, converted the numbers to standard notation, and added some color. Overall, I had a lot of fun putting together the code to create this visual.

Source

https://stackoverflow.com/questions/28190435/changing-factor-levels-with-dplyr-mutate