chooseCRANmirror(graphics = FALSE, ind = 1)  # Selects the first mirror
install.packages("dplyr")
## 
## The downloaded binary packages are in
##  /var/folders/nz/h7z329n55nxfs2dv7hmbhc400000gn/T//RtmptOixtv/downloaded_packages
install.packages("tidyr")
## 
## The downloaded binary packages are in
##  /var/folders/nz/h7z329n55nxfs2dv7hmbhc400000gn/T//RtmptOixtv/downloaded_packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)

Load and view the csv file

flights <- read.csv("/Users/aaliyahmjh/Documents/assignment4.csv", stringsAsFactors = FALSE)   
glimpse (flights)  
## Rows: 5
## Columns: 7
## $ X             <chr> "ALASKA", "", "", "AMWEST", ""
## $ X.1           <chr> "on time", "delayed", "", "on time", "delayed"
## $ Los.Angeles   <int> 497, 62, NA, 694, 117
## $ Phoenix       <int> 221, 12, NA, 4840, 415
## $ San.Diego     <int> 212, 20, NA, 383, 65
## $ San.Francisco <int> 503, 102, NA, 320, 129
## $ Seattle       <int> 1841, 305, NA, 201, 61

Tidying the data

Drop all empty rows

flights <- flights %>% drop_na()
glimpse (flights)  
## Rows: 4
## Columns: 7
## $ X             <chr> "ALASKA", "", "AMWEST", ""
## $ X.1           <chr> "on time", "delayed", "on time", "delayed"
## $ Los.Angeles   <int> 497, 62, 694, 117
## $ Phoenix       <int> 221, 12, 4840, 415
## $ San.Diego     <int> 212, 20, 383, 65
## $ San.Francisco <int> 503, 102, 320, 129
## $ Seattle       <int> 1841, 305, 201, 61

Create clearer column names

colnames(flights) <- c("airline", "status", "Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seattle")
glimpse (flights)  
## Rows: 4
## Columns: 7
## $ airline       <chr> "ALASKA", "", "AMWEST", ""
## $ status        <chr> "on time", "delayed", "on time", "delayed"
## $ Los_Angeles   <int> 497, 62, 694, 117
## $ Phoenix       <int> 221, 12, 4840, 415
## $ San_Diego     <int> 212, 20, 383, 65
## $ San_Francisco <int> 503, 102, 320, 129
## $ Seattle       <int> 1841, 305, 201, 61

Fill the empty rows with the correct airline name

flights <- flights %>%
  mutate(airline = na_if(airline, "")) %>%  # Convert empty strings to NA
  fill(airline, .direction = "down")  
glimpse (flights)  
## Rows: 4
## Columns: 7
## $ airline       <chr> "ALASKA", "ALASKA", "AMWEST", "AMWEST"
## $ status        <chr> "on time", "delayed", "on time", "delayed"
## $ Los_Angeles   <int> 497, 62, 694, 117
## $ Phoenix       <int> 221, 12, 4840, 415
## $ San_Diego     <int> 212, 20, 383, 65
## $ San_Francisco <int> 503, 102, 320, 129
## $ Seattle       <int> 1841, 305, 201, 61

Converting the sheet from wide to long format for easier analysis

flights_long <- flights %>%
  pivot_longer(cols = Los_Angeles:Seattle, 
               names_to = "city", 
               values_to = "flights")
glimpse (flights_long)  
## Rows: 20
## Columns: 4
## $ airline <chr> "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", "A…
## $ status  <chr> "on time", "on time", "on time", "on time", "on time", "delaye…
## $ city    <chr> "Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seatt…
## $ flights <int> 497, 221, 212, 503, 1841, 62, 12, 20, 102, 305, 694, 4840, 383…

Analysis

Summary of On-Time vs. Delayed Flights by Airline

From this summary, I can see that Alaska Airlines has less delayed flights than Amwest does but they also recorded just over half of the total amount of flights that Amwest did.

I’m interested in seeing what is the rate of delayed flights since this total number of flights is so different

airline_summary <- flights_long %>%
  group_by(airline, status) %>%
  summarise(total_flights = sum(flights)) %>%
  pivot_wider(names_from = status, values_from = total_flights)
## `summarise()` has grouped output by 'airline'. You can override using the
## `.groups` argument.
print(airline_summary) 
## # A tibble: 2 × 3
## # Groups:   airline [2]
##   airline delayed `on time`
##   <chr>     <int>     <int>
## 1 ALASKA      501      3274
## 2 AMWEST      787      6438

Cleaning up column names to remove spaces and make them more accessible

airline_summary <- airline_summary %>%
  rename(delayed = 'delayed',
    on_time = 'on time'
  )

Finding the rate of delays for each airline

Even though, when looking at the overall count of flights delayed by each airline it seemed like Amwest was worse, we can tell by the rate that they actually perform better on a whole when looking at their rate of departure delays since it is lower than Alaska Airlines’.

delayed_rate <- airline_summary %>%
  mutate(
    del_rate = delayed / (delayed + on_time)
  ) %>%
  select(airline, del_rate) %>%
  arrange(desc(del_rate))

glimpse(delayed_rate)
## Rows: 2
## Columns: 2
## Groups: airline [2]
## $ airline  <chr> "ALASKA", "AMWEST"
## $ del_rate <dbl> 0.1327152, 0.1089273
ggplot(airline_summary, aes(x = airline, y = on_time, fill = "On Time")) +
  geom_bar(stat = "identity") +
  geom_bar(aes(y = delayed, fill = "Delayed"), stat = "identity") +
  labs(title = "On-time vs. Delayed Flights per Airline", y = "Number of Flights") +
  scale_fill_manual(values = c("On Time" = "blue", "Delayed" = "orange")) +
  theme_minimal()