#Libraries needed
library(RCurl)
library(dplyr)
library(tidyverse)
library(tidyfast)
library(tidyr)
library(ggplot2)

Import Flights Data

Import Flights data from GitHub

site <- getURL("https://raw.githubusercontent.com/ltcancel/Homework-5/master/Airline%20Data.csv")
df <- read.csv(text = site, sep = ",", header  = TRUE)

head(df)
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Tidy the data

Some of the columns need to be renamed. The data also imported with null values where a row will need to be removed or observations will need to be filled with the appropriate data.

#Fix column names of the first two columns
df <- rename(df, Airline = X, Status = X.1)
df
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61
#Remove the null row
df <- df %>% drop_na()
df
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

I tried to use the fill function to fill in the Airline data in the observations that are blank but it did not work at first. R did not read these observations as NULL or NA so fill did not work. I made these observations NA so fill would work. Even though I could have just filled in the observations with the exact values, I chose to go this route to practice using some of these functions.

#Make blank Airline NULL so it can be filled
df[2,1] <- NA
df[4,1] <- NA

df
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2    <NA> delayed          62      12        20           102     305
## 4 AM WEST on time         694    4840       383           320     201
## 5    <NA> delayed         117     415        65           129      61
#Fill the new NA columns with the values from the row above
df <- fill(df, Airline)
df
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 4 AM WEST on time         694    4840       383           320     201
## 5 AM WEST delayed         117     415        65           129      61
#Create a new data frame making some of the columns as rows instead
long_df <- df %>% dt_pivot_longer(
  cols = -c(Airline,Status),
  names_to = "City",
  values_to = "Flights"
)

head(long_df)
##    Airline  Status        City Flights
## 1:  ALASKA on time Los.Angeles     497
## 2:  ALASKA delayed Los.Angeles      62
## 3: AM WEST on time Los.Angeles     694
## 4: AM WEST delayed Los.Angeles     117
## 5:  ALASKA on time     Phoenix     221
## 6:  ALASKA delayed     Phoenix      12

Flights Analysis

Analysis to compare arrival delays for both airlines.

#split data by flight status 
onTime <- long_df %>%
  filter(Status == "on time")

delayed <- long_df %>% 
  filter(Status == "delayed")

onTime
##    Airline  Status          City Flights
## 1   ALASKA on time   Los.Angeles     497
## 2  AM WEST on time   Los.Angeles     694
## 3   ALASKA on time       Phoenix     221
## 4  AM WEST on time       Phoenix    4840
## 5   ALASKA on time     San.Diego     212
## 6  AM WEST on time     San.Diego     383
## 7   ALASKA on time San.Francisco     503
## 8  AM WEST on time San.Francisco     320
## 9   ALASKA on time       Seattle    1841
## 10 AM WEST on time       Seattle     201
delayed
##    Airline  Status          City Flights
## 1   ALASKA delayed   Los.Angeles      62
## 2  AM WEST delayed   Los.Angeles     117
## 3   ALASKA delayed       Phoenix      12
## 4  AM WEST delayed       Phoenix     415
## 5   ALASKA delayed     San.Diego      20
## 6  AM WEST delayed     San.Diego      65
## 7   ALASKA delayed San.Francisco     102
## 8  AM WEST delayed San.Francisco     129
## 9   ALASKA delayed       Seattle     305
## 10 AM WEST delayed       Seattle      61
#combine subsets so each flight status is in their own column
final_flights <- cbind(onTime, delayed[,4])
final_flights
##    Airline  Status          City Flights delayed[, 4]
## 1   ALASKA on time   Los.Angeles     497           62
## 2  AM WEST on time   Los.Angeles     694          117
## 3   ALASKA on time       Phoenix     221           12
## 4  AM WEST on time       Phoenix    4840          415
## 5   ALASKA on time     San.Diego     212           20
## 6  AM WEST on time     San.Diego     383           65
## 7   ALASKA on time San.Francisco     503          102
## 8  AM WEST on time San.Francisco     320          129
## 9   ALASKA on time       Seattle    1841          305
## 10 AM WEST on time       Seattle     201           61
#Tidy final data frame to keep only the columns needed and rename columns
final_flights <- final_flights %>%
  rename(
    onTime = Flights,
    Delayed = `delayed[, 4]`
  ) %>%
  select(Airline,City,onTime,Delayed)

final_flights
##    Airline          City onTime Delayed
## 1   ALASKA   Los.Angeles    497      62
## 2  AM WEST   Los.Angeles    694     117
## 3   ALASKA       Phoenix    221      12
## 4  AM WEST       Phoenix   4840     415
## 5   ALASKA     San.Diego    212      20
## 6  AM WEST     San.Diego    383      65
## 7   ALASKA San.Francisco    503     102
## 8  AM WEST San.Francisco    320     129
## 9   ALASKA       Seattle   1841     305
## 10 AM WEST       Seattle    201      61
#Comparison of on time flights by Airline for each city
ggplot(final_flights, aes(x = Airline, y = onTime, fill = Airline)) + 
  geom_bar(stat = "identity") +
  facet_wrap(~City)

#Comparison of delayed flights by Airline for each city
ggplot(final_flights, aes(x = Airline, y = Delayed, fill = Airline)) + 
  geom_bar(stat = "identity") +
  facet_wrap(~City)

#Comparison of cumulative status for each airline
ggplot(long_df, aes(x = Status, y = Flights, fill = Airline)) + 
  geom_bar(stat = "identity") +
  facet_wrap(~Airline)