library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Location of the raw flights CSV (flight counts per airline and status for
# five destination cities).
url <- "https://raw.githubusercontent.com/AlphaCurse/DATA607/main/flights.csv"
# Download and parse the file, then preview the first rows.
# NOTE(review): the first column name in the printed preview looks like
# byte-order-mark residue; the columns are renamed by position further down.
df <- read.csv(file = url)
head(x = df)
## ï.. X Los.Angeles Pheonix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
# Give the two unnamed leading columns real names and replace the dots that
# read.csv() put into the multi-word city names with underscores. Column 4
# ("Pheonix") and column 7 ("Seattle") keep the names they were read with.
colnames(df)[c(1, 2, 3, 5, 6)] <- c(
  "Airline", "Status", "Los_Angeles", "San_Diego", "San_Francisco"
)
# Drop the blank separator row between the two airlines. It is identified by
# having no counts at all (every city column is NA) rather than by a
# hard-coded row position, so the step keeps working if rows are added,
# removed or reordered in the source file.
df <- df %>%
  filter(!if_all(Los_Angeles:Seattle, is.na))
Next, I add a Total column that sums each row's flight counts across the five destination cities:
# Append a Total column holding the row-wise sum of the five city counts;
# na.rm = TRUE makes missing counts contribute nothing to the sum.
df <- mutate(
  df,
  Total = rowSums(
    select(df, Los_Angeles, Pheonix, San_Diego, San_Francisco, Seattle),
    na.rm = TRUE
  )
)
head(df)
## Airline Status Los_Angeles Pheonix San_Diego San_Francisco Seattle Total
## 1 ALASKA on time 497 221 212 503 1841 3274
## 2 delayed 62 12 20 102 305 501
## 3 AM WEST on time 694 4840 383 320 201 6438
## 4 delayed 117 415 65 129 61 787
Below, I calculate the percentage of flights that were delayed for each airline. Alaska's flights were delayed about 13% of the time, which is worse than AM West’s rate of about 11%. Customers may therefore prefer to fly with AM West as opposed to Alaska because of its lower overall delay rate.
# Percentage of delayed flights within a two-row slice of `df` holding one
# airline's "on time" and "delayed" totals.
#
# rows: data frame with a `Status` column ("on time" / "delayed") and a
#       numeric `Total` column.
# Returns delayed / (delayed + on time) * 100.
delay_pct <- function(rows) {
  status <- trimws(rows$Status)
  delayed <- rows$Total[status == "delayed"]
  on_time <- rows$Total[status == "on time"]
  (delayed / (delayed + on_time)) * 100
}

# The airline name is only filled in on each airline's first row, so the two
# airlines are picked out by position: rows 1-2 are ALASKA, rows 3-4 are
# AM WEST. The totals come from `df` instead of being copied in by hand.
alaska <- delay_pct(df[1:2, ])
am_west <- delay_pct(df[3:4, ])
alaska
## [1] 13.27152
am_west
## [1] 10.89273