Import Libraries

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Read the Data

# Location of the flights CSV on GitHub.
url <- "https://raw.githubusercontent.com/AlphaCurse/DATA607/main/flights.csv"

# Download the file and parse it into a data frame using read.csv() defaults.
# NOTE(review): the first header prints as a garbled name in the output below,
# which looks like an unstripped UTF-8 byte-order mark; the column is renamed
# by position later, so nothing downstream depends on it -- confirm before
# changing the encoding.
df <- read.csv(file = url)

# Preview the first rows of the raw table.
head(df)
##       ï..       X Los.Angeles Pheonix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Tidy and Transform Data

# Rename the two unnamed leading columns and replace the dot-separated city
# names with underscore-separated ones. Columns 4 (Pheonix) and 7 (Seattle)
# keep the names read from the CSV header.
colnames(df)[c(1, 2, 3, 5, 6)] <- c(
  "Airline", "Status", "Los_Angeles", "San_Diego", "San_Francisco"
)

# Drop the empty separator row between the two airlines. It is identified by
# its blank Status rather than by a hard-coded row position, so the step keeps
# working if rows are added to or reordered in the CSV.
df <- df %>%
  filter(trimws(as.character(Status)) != "")

# NOTE(review): Airline is still blank on each "delayed" row (only the
# "on time" row of every pair carries the airline name).

Analysis comparing arrival delays

First, add a Total column that sums each row's flight counts across the five destination cities.

# Add a Total column: the row-wise sum of the five city count columns,
# ignoring missing values.
df <- mutate(
  df,
  Total = rowSums(
    select(df, Los_Angeles, Pheonix, San_Diego, San_Francisco, Seattle),
    na.rm = TRUE
  )
)

# Preview the table with the new column.
head(df)
##   Airline  Status Los_Angeles Pheonix San_Diego San_Francisco Seattle Total
## 1  ALASKA on time         497     221       212           503    1841  3274
## 2         delayed          62      12        20           102     305   501
## 3 AM WEST on time         694    4840       383           320     201  6438
## 4         delayed         117     415        65           129      61   787

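Below, I have calculated the percentage of flights that were delayed for each airline. Overall, about 13% of Alaska's flights were delayed, compared with about 11% of AM West's, so Alaska looks worse in aggregate and customers may prefer to fly with AM West as opposed to Alaska. Note, however, that these rates pool all five cities together; in the table above, Alaska has the lower delay rate in every individual city, so the aggregate gap reflects the airlines' different route mixes rather than better per-route performance by AM West.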

# Percentage of each airline's flights that were delayed, computed from the
# Total column of df instead of numbers copied by hand from printed output,
# so the result follows the data if it changes.

# Work on a local copy in which every row carries its airline name: blank
# Airline cells become NA and are filled downward from the row above.
flights <- df %>%
  mutate(
    Airline = na_if(trimws(as.character(Airline)), ""),
    Status = trimws(as.character(Status))
  ) %>%
  fill(Airline)

# Flights per airline: all of them, and the delayed ones only.
is_delayed <- flights$Status == "delayed"
flights_total <- tapply(flights$Total, flights$Airline, sum)
flights_delayed <- tapply(
  flights$Total[is_delayed], flights$Airline[is_delayed], sum
)

# Delayed share as a percentage, aligned by airline name.
delay_pct <- flights_delayed / flights_total[names(flights_delayed)] * 100

# `[[` drops the name so each value prints as a bare number.
alaska <- delay_pct[["ALASKA"]]
am_west <- delay_pct[["AM WEST"]]
alaska
## [1] 13.27152
am_west
## [1] 10.89273