Possibly needed libraries
library(tidyverse)
library(openintro)
library(stringr)
library(readr)
library(tidyr)
library(dplyr)
library(ggplot2)
Load in data
flights_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/Delays.csv"
flights_raw <- read.csv(flights_url, header = TRUE, stringsAsFactors = FALSE)
flights_raw
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 NA NA NA NA NA
## 2 ALASKA on time 497 221 212 503 1841
## 3 delayed 62 12 20 102 305
## 4 NA NA NA NA NA
## 5 AMWEST on time 694 4840 383 320 201
## 6 delayed 117 415 65 129 61
Remove both un-needed blank lines
Flights <- na.omit(flights_raw)
Flights
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 2 ALASKA on time 497 221 212 503 1841
## 3 delayed 62 12 20 102 305
## 5 AMWEST on time 694 4840 383 320 201
## 6 delayed 117 415 65 129 61
Add missing column names
colnames(Flights)[1] <- "Airline"
colnames(Flights)[2] <- "Status"
Flights <- dplyr::rename(Flights, 'Los Angeles' = Los.Angeles)
Flights <- dplyr::rename(Flights, 'Phoenix' = Phoenix)
Flights <- dplyr::rename(Flights, 'San Diego' = San.Diego)
Flights <- dplyr::rename(Flights, 'San Francisco'= San.Francisco)
Flights <- dplyr::rename(Flights, 'Seattle' = Seattle)
Flights
## Airline Status Los Angeles Phoenix San Diego San Francisco Seattle
## 2 ALASKA on time 497 221 212 503 1841
## 3 delayed 62 12 20 102 305
## 5 AMWEST on time 694 4840 383 320 201
## 6 delayed 117 415 65 129 61
Add the flight origin to missing rows
Flights_clean <- tidyr::fill(Flights, Airline)
# Fill doesn't work here oddly
Flights_clean[2,1] <- Flights_clean[1,1]
# Adds Alaska to missing space
Flights_clean[4,1] <- Flights_clean[3,1]
# Adds AMWEST to missing space
Flights_clean
## Airline Status Los Angeles Phoenix San Diego San Francisco Seattle
## 2 ALASKA on time 497 221 212 503 1841
## 3 ALASKA delayed 62 12 20 102 305
## 5 AMWEST on time 694 4840 383 320 201
## 6 AMWEST delayed 117 415 65 129 61
Analysis to compare arrival delays for the two airlines
dplyr::glimpse(Flights_clean)
## Rows: 4
## Columns: 7
## $ Airline <chr> "ALASKA", "ALASKA", "AMWEST", "AMWEST"
## $ Status <chr> "on time", "delayed", "on time", "delayed"
## $ `Los Angeles` <int> 497, 62, 694, 117
## $ Phoenix <int> 221, 12, 4840, 415
## $ `San Diego` <int> 212, 20, 383, 65
## $ `San Francisco` <int> 503, 102, 320, 129
## $ Seattle <int> 1841, 305, 201, 61
# Now data is formatted properly, can analyze
# ------------------------------------------------------------------------------
Alaska_ontime_avg <- rowMeans(Flights_clean[1,3:length(Flights_clean)])
Alaska_ontime_avg # 654.8
## 2
## 654.8
Alaska_delayed_avg <- rowMeans(Flights_clean[2,3:length(Flights_clean)])
Alaska_delayed_avg # 100.2
## 3
## 100.2
# ------------------------------------------------------------------------------
Amwest_ontime_avg <- rowMeans(Flights_clean[3,3:length(Flights_clean)])
Amwest_ontime_avg # 1287.6
## 5
## 1287.6
Amwest_delayed_avg <- rowMeans(Flights_clean[4,3:length(Flights_clean)])
Amwest_delayed_avg # 157.4
## 6
## 157.4
On average, there are more almost twice as many flights that are on-time with Amwest Airlines, but 50% more average delays as well.
dplyr::glimpse(Flights_clean)
## Rows: 4
## Columns: 7
## $ Airline <chr> "ALASKA", "ALASKA", "AMWEST", "AMWEST"
## $ Status <chr> "on time", "delayed", "on time", "delayed"
## $ `Los Angeles` <int> 497, 62, 694, 117
## $ Phoenix <int> 221, 12, 4840, 415
## $ `San Diego` <int> 212, 20, 383, 65
## $ `San Francisco` <int> 503, 102, 320, 129
## $ Seattle <int> 1841, 305, 201, 61
# ------------------------------------------------------------------------------
Alaska_ontime_total <- rowSums(Flights_clean[1,3:length(Flights_clean)])
Alaska_ontime_total # 3274
## 2
## 3274
Alaska_delayed_total <- rowSums(Flights_clean[2,3:length(Flights_clean)])
Alaska_delayed_total # 501
## 3
## 501
# ------------------------------------------------------------------------------
Amwest_ontime_total <- rowSums(Flights_clean[3,3:length(Flights_clean)])
Amwest_ontime_total # 6438
## 5
## 6438
Amwest_delayed_total<- rowSums(Flights_clean[4,3:length(Flights_clean)])
Amwest_delayed_total # 787
## 6
## 787
Thus we can see that AmWest is actually a betetr airline overall, as it had a greater ratio of on-time flights compared to delayed-flights, both in raw amounts and averages. Let’s confirm with Dplyr
Better version; Make the data wide for cleaner analysis
Flights_Wide <- Flights_clean %>%
# change format from wide to long to tidy up data
pivot_longer(`Los Angeles`:`Seattle`, names_to = "Location", values_to = "Count")
Flights_Wide
## # A tibble: 20 × 4
## Airline Status Location Count
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San Diego 212
## 4 ALASKA on time San Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San Diego 20
## 9 ALASKA delayed San Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AMWEST on time Los Angeles 694
## 12 AMWEST on time Phoenix 4840
## 13 AMWEST on time San Diego 383
## 14 AMWEST on time San Francisco 320
## 15 AMWEST on time Seattle 201
## 16 AMWEST delayed Los Angeles 117
## 17 AMWEST delayed Phoenix 415
## 18 AMWEST delayed San Diego 65
## 19 AMWEST delayed San Francisco 129
## 20 AMWEST delayed Seattle 61
Delayed Flights
Delayed_flights <- Flights_Wide %>%
filter(Status == "delayed")
Delayed_flights
## # A tibble: 10 × 4
## Airline Status Location Count
## <chr> <chr> <chr> <int>
## 1 ALASKA delayed Los Angeles 62
## 2 ALASKA delayed Phoenix 12
## 3 ALASKA delayed San Diego 20
## 4 ALASKA delayed San Francisco 102
## 5 ALASKA delayed Seattle 305
## 6 AMWEST delayed Los Angeles 117
## 7 AMWEST delayed Phoenix 415
## 8 AMWEST delayed San Diego 65
## 9 AMWEST delayed San Francisco 129
## 10 AMWEST delayed Seattle 61
# ------------------------------------------------------------------------------
# By flight location
Delays_per_location <- Delayed_flights %>%
group_by(Airline, Location) %>%
summarise(Average_Delay = mean(Count),
Total_Delays = sum(Count)) %>%
arrange(desc(Average_Delay))
Delays_per_location
## # A tibble: 10 × 4
## # Groups: Airline [2]
## Airline Location Average_Delay Total_Delays
## <chr> <chr> <dbl> <int>
## 1 AMWEST Phoenix 415 415
## 2 ALASKA Seattle 305 305
## 3 AMWEST San Francisco 129 129
## 4 AMWEST Los Angeles 117 117
## 5 ALASKA San Francisco 102 102
## 6 AMWEST San Diego 65 65
## 7 ALASKA Los Angeles 62 62
## 8 AMWEST Seattle 61 61
## 9 ALASKA San Diego 20 20
## 10 ALASKA Phoenix 12 12
# ------------------------------------------------------------------------------
# By airline
Delays_per_airline <- Delayed_flights %>%
group_by(Airline) %>%
summarise(Average_Delay = mean(Count),
Total_Delays = sum(Count)) %>%
arrange(desc(Total_Delays))
Delays_per_airline
## # A tibble: 2 × 3
## Airline Average_Delay Total_Delays
## <chr> <dbl> <int>
## 1 AMWEST 157. 787
## 2 ALASKA 100. 501
Total Delays by Airline
ggplot(Delays_per_airline, aes(x= Airline, y = Total_Delays, color = Airline, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge") +
ggtitle("Total Delays Per Airline") +
labs(x = "Airline Carrier", y = "Total Delays") +
geom_text(aes(label = Total_Delays), vjust = 1.5, colour = "black")
Total Delays by location by Airline
ggplot(Delays_per_location, aes(x = Location, y = Average_Delay, color = Airline, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge") +
ggtitle("Total Delays by Location by Airline") +
labs(x = "Airport Destinations", y = "Total Delays") +
geom_text(aes(label = Average_Delay), colour = "black")