#Libraries needed
library(RCurl)
library(dplyr)
library(tidyverse)
library(tidyfast)
library(tidyr)
library(ggplot2)
Import Flights data from GitHub
site <- getURL("https://raw.githubusercontent.com/ltcancel/Homework-5/master/Airline%20Data.csv")
df <- read.csv(text = site, sep = ",", header = TRUE)
head(df)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Some of the columns need to be renamed. The data also imported with null values where a row will need to be removed or observations will need to be filled with the appropriate data.
#Fix column names of the first two columns
df <- rename(df, Airline = X, Status = X.1)
df
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
#Remove the null row
df <- df %>% drop_na()
df
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
I tried to use the fill function to fill in the Airline data in the observations that are blank but it did not work at first. R did not read these observations as NULL or NA so fill did not work. I made these observations NA so fill would work. Even though I could have just filled in the observations with the exact values, I chose to go this route to practice using some of these functions.
#Make blank Airline NULL so it can be filled
df[2,1] <- NA
df[4,1] <- NA
df
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
#Fill the new NA columns with the values from the row above
df <- fill(df, Airline)
df
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
#Create a new data frame making some of the columns as rows instead
long_df <- df %>% dt_pivot_longer(
cols = -c(Airline,Status),
names_to = "City",
values_to = "Flights"
)
head(long_df)
## Airline Status City Flights
## 1: ALASKA on time Los.Angeles 497
## 2: ALASKA delayed Los.Angeles 62
## 3: AM WEST on time Los.Angeles 694
## 4: AM WEST delayed Los.Angeles 117
## 5: ALASKA on time Phoenix 221
## 6: ALASKA delayed Phoenix 12
Analysis to compare arrival delays for both airlines.
#split data by flight status
onTime <- long_df %>%
filter(Status == "on time")
delayed <- long_df %>%
filter(Status == "delayed")
onTime
## Airline Status City Flights
## 1 ALASKA on time Los.Angeles 497
## 2 AM WEST on time Los.Angeles 694
## 3 ALASKA on time Phoenix 221
## 4 AM WEST on time Phoenix 4840
## 5 ALASKA on time San.Diego 212
## 6 AM WEST on time San.Diego 383
## 7 ALASKA on time San.Francisco 503
## 8 AM WEST on time San.Francisco 320
## 9 ALASKA on time Seattle 1841
## 10 AM WEST on time Seattle 201
delayed
## Airline Status City Flights
## 1 ALASKA delayed Los.Angeles 62
## 2 AM WEST delayed Los.Angeles 117
## 3 ALASKA delayed Phoenix 12
## 4 AM WEST delayed Phoenix 415
## 5 ALASKA delayed San.Diego 20
## 6 AM WEST delayed San.Diego 65
## 7 ALASKA delayed San.Francisco 102
## 8 AM WEST delayed San.Francisco 129
## 9 ALASKA delayed Seattle 305
## 10 AM WEST delayed Seattle 61
#combine subsets so each flight status is in their own column
final_flights <- cbind(onTime, delayed[,4])
final_flights
## Airline Status City Flights delayed[, 4]
## 1 ALASKA on time Los.Angeles 497 62
## 2 AM WEST on time Los.Angeles 694 117
## 3 ALASKA on time Phoenix 221 12
## 4 AM WEST on time Phoenix 4840 415
## 5 ALASKA on time San.Diego 212 20
## 6 AM WEST on time San.Diego 383 65
## 7 ALASKA on time San.Francisco 503 102
## 8 AM WEST on time San.Francisco 320 129
## 9 ALASKA on time Seattle 1841 305
## 10 AM WEST on time Seattle 201 61
#Tidy final data frame to keep only the columns needed and rename columns
final_flights <- final_flights %>%
rename(
onTime = Flights,
Delayed = `delayed[, 4]`
) %>%
select(Airline,City,onTime,Delayed)
final_flights
## Airline City onTime Delayed
## 1 ALASKA Los.Angeles 497 62
## 2 AM WEST Los.Angeles 694 117
## 3 ALASKA Phoenix 221 12
## 4 AM WEST Phoenix 4840 415
## 5 ALASKA San.Diego 212 20
## 6 AM WEST San.Diego 383 65
## 7 ALASKA San.Francisco 503 102
## 8 AM WEST San.Francisco 320 129
## 9 ALASKA Seattle 1841 305
## 10 AM WEST Seattle 201 61
#Comparison of on time flights by Airline for each city
ggplot(final_flights, aes(x = Airline, y = onTime, fill = Airline)) +
geom_bar(stat = "identity") +
facet_wrap(~City)
#Comparison of delayed flights by Airline for each city
ggplot(final_flights, aes(x = Airline, y = Delayed, fill = Airline)) +
geom_bar(stat = "identity") +
facet_wrap(~City)
#Comparison of cumulative status for each airline
ggplot(long_df, aes(x = Status, y = Flights, fill = Airline)) +
geom_bar(stat = "identity") +
facet_wrap(~Airline)