chooseCRANmirror(graphics = FALSE, ind = 1) # Selects the first mirror
install.packages("dplyr")
##
## The downloaded binary packages are in
## /var/folders/nz/h7z329n55nxfs2dv7hmbhc400000gn/T//RtmptOixtv/downloaded_packages
install.packages("tidyr")
##
## The downloaded binary packages are in
## /var/folders/nz/h7z329n55nxfs2dv7hmbhc400000gn/T//RtmptOixtv/downloaded_packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
flights <- read.csv("/Users/aaliyahmjh/Documents/assignment4.csv", stringsAsFactors = FALSE)
glimpse (flights)
## Rows: 5
## Columns: 7
## $ X <chr> "ALASKA", "", "", "AMWEST", ""
## $ X.1 <chr> "on time", "delayed", "", "on time", "delayed"
## $ Los.Angeles <int> 497, 62, NA, 694, 117
## $ Phoenix <int> 221, 12, NA, 4840, 415
## $ San.Diego <int> 212, 20, NA, 383, 65
## $ San.Francisco <int> 503, 102, NA, 320, 129
## $ Seattle <int> 1841, 305, NA, 201, 61
flights <- flights %>% drop_na()
glimpse (flights)
## Rows: 4
## Columns: 7
## $ X <chr> "ALASKA", "", "AMWEST", ""
## $ X.1 <chr> "on time", "delayed", "on time", "delayed"
## $ Los.Angeles <int> 497, 62, 694, 117
## $ Phoenix <int> 221, 12, 4840, 415
## $ San.Diego <int> 212, 20, 383, 65
## $ San.Francisco <int> 503, 102, 320, 129
## $ Seattle <int> 1841, 305, 201, 61
colnames(flights) <- c("airline", "status", "Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seattle")
glimpse (flights)
## Rows: 4
## Columns: 7
## $ airline <chr> "ALASKA", "", "AMWEST", ""
## $ status <chr> "on time", "delayed", "on time", "delayed"
## $ Los_Angeles <int> 497, 62, 694, 117
## $ Phoenix <int> 221, 12, 4840, 415
## $ San_Diego <int> 212, 20, 383, 65
## $ San_Francisco <int> 503, 102, 320, 129
## $ Seattle <int> 1841, 305, 201, 61
flights <- flights %>%
mutate(airline = na_if(airline, "")) %>% # Convert empty strings to NA
fill(airline, .direction = "down")
glimpse (flights)
## Rows: 4
## Columns: 7
## $ airline <chr> "ALASKA", "ALASKA", "AMWEST", "AMWEST"
## $ status <chr> "on time", "delayed", "on time", "delayed"
## $ Los_Angeles <int> 497, 62, 694, 117
## $ Phoenix <int> 221, 12, 4840, 415
## $ San_Diego <int> 212, 20, 383, 65
## $ San_Francisco <int> 503, 102, 320, 129
## $ Seattle <int> 1841, 305, 201, 61
flights_long <- flights %>%
pivot_longer(cols = Los_Angeles:Seattle,
names_to = "city",
values_to = "flights")
glimpse (flights_long)
## Rows: 20
## Columns: 4
## $ airline <chr> "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", "A…
## $ status <chr> "on time", "on time", "on time", "on time", "on time", "delaye…
## $ city <chr> "Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seatt…
## $ flights <int> 497, 221, 212, 503, 1841, 62, 12, 20, 102, 305, 694, 4840, 383…
From this summary, I can see that Alaska Airlines has less delayed flights than Amwest does but they also recorded just over half of the total amount of flights that Amwest did.
I’m interested in seeing what is the rate of delayed flights since this total number of flights is so different
airline_summary <- flights_long %>%
group_by(airline, status) %>%
summarise(total_flights = sum(flights)) %>%
pivot_wider(names_from = status, values_from = total_flights)
## `summarise()` has grouped output by 'airline'. You can override using the
## `.groups` argument.
print(airline_summary)
## # A tibble: 2 × 3
## # Groups: airline [2]
## airline delayed `on time`
## <chr> <int> <int>
## 1 ALASKA 501 3274
## 2 AMWEST 787 6438
airline_summary <- airline_summary %>%
rename(delayed = 'delayed',
on_time = 'on time'
)
Even though, when looking at the overall count of flights delayed by each airline it seemed like Amwest was worse, we can tell by the rate that they actually perform better on a whole when looking at their rate of departure delays since it is lower than Alaska Airlines’.
delayed_rate <- airline_summary %>%
mutate(
del_rate = delayed / (delayed + on_time)
) %>%
select(airline, del_rate) %>%
arrange(desc(del_rate))
glimpse(delayed_rate)
## Rows: 2
## Columns: 2
## Groups: airline [2]
## $ airline <chr> "ALASKA", "AMWEST"
## $ del_rate <dbl> 0.1327152, 0.1089273
ggplot(airline_summary, aes(x = airline, y = on_time, fill = "On Time")) +
geom_bar(stat = "identity") +
geom_bar(aes(y = delayed, fill = "Delayed"), stat = "identity") +
labs(title = "On-time vs. Delayed Flights per Airline", y = "Number of Flights") +
scale_fill_manual(values = c("On Time" = "blue", "Delayed" = "orange")) +
theme_minimal()