In this project, goal is to read a .csv of data showing counts of whether two airlines were on time or delayed flying into multiple destinations, transform and tidy the data, and then do an analysis to compare the arrival delays for both airlines.
library('dplyr')
library('tidyr')
df <- read.csv("airline.csv", header=TRUE,stringsAsFactors = FALSE)
df
## X X.1 LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AMWEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
#add missing data
df[2, 1] <- "ALASKA"
df[5, 1] <- "AMWEST"
df
## X X.1 LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AMWEST on time 694 4840 383 320 201
## 5 AMWEST delayed 117 415 65 129 61
data_long <- gather(df, city, number, LosAngeles:Seattle, factor_key=TRUE)
# rename column name.
colnames(data_long)[2] <- "status"
colnames(data_long)[1] <- "Airline"
data_long$Airline <- factor(data_long$Airline)
data_long
## Airline status city number
## 1 ALASKA on time LosAngeles 497
## 2 ALASKA delayed LosAngeles 62
## 3 LosAngeles NA
## 4 AMWEST on time LosAngeles 694
## 5 AMWEST delayed LosAngeles 117
## 6 ALASKA on time Phoenix 221
## 7 ALASKA delayed Phoenix 12
## 8 Phoenix NA
## 9 AMWEST on time Phoenix 4840
## 10 AMWEST delayed Phoenix 415
## 11 ALASKA on time SanDiego 212
## 12 ALASKA delayed SanDiego 20
## 13 SanDiego NA
## 14 AMWEST on time SanDiego 383
## 15 AMWEST delayed SanDiego 65
## 16 ALASKA on time SanFrancisco 503
## 17 ALASKA delayed SanFrancisco 102
## 18 SanFrancisco NA
## 19 AMWEST on time SanFrancisco 320
## 20 AMWEST delayed SanFrancisco 129
## 21 ALASKA on time Seattle 1841
## 22 ALASKA delayed Seattle 305
## 23 Seattle NA
## 24 AMWEST on time Seattle 201
## 25 AMWEST delayed Seattle 61
# remove NA
clean_data <- data_long %>%
filter(!is.na(number))
clean_data
## Airline status city number
## 1 ALASKA on time LosAngeles 497
## 2 ALASKA delayed LosAngeles 62
## 3 AMWEST on time LosAngeles 694
## 4 AMWEST delayed LosAngeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AMWEST on time Phoenix 4840
## 8 AMWEST delayed Phoenix 415
## 9 ALASKA on time SanDiego 212
## 10 ALASKA delayed SanDiego 20
## 11 AMWEST on time SanDiego 383
## 12 AMWEST delayed SanDiego 65
## 13 ALASKA on time SanFrancisco 503
## 14 ALASKA delayed SanFrancisco 102
## 15 AMWEST on time SanFrancisco 320
## 16 AMWEST delayed SanFrancisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AMWEST on time Seattle 201
## 20 AMWEST delayed Seattle 61
clean_data<-spread(clean_data, "status" , number)
clean_data
## Airline city delayed on time
## 1 ALASKA LosAngeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA SanDiego 20 212
## 4 ALASKA SanFrancisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AMWEST LosAngeles 117 694
## 7 AMWEST Phoenix 415 4840
## 8 AMWEST SanDiego 65 383
## 9 AMWEST SanFrancisco 129 320
## 10 AMWEST Seattle 61 201
analysis <- as.data.frame(clean_data %>%
mutate(total = delayed + `on time`) %>%
group_by(Airline) %>%
summarise(delayed_total = sum(delayed),
on_time_total = sum(`on time`),
count_total = sum(total),
percent_ontime = round(on_time_total/count_total*100,digit = 2)))
analysis
## Airline delayed_total on_time_total count_total percent_ontime
## 1 ALASKA 501 3274 3775 86.73
## 2 AMWEST 787 6438 7225 89.11
dotchart(analysis$percent_ontime, labels=analysis$airline, main = "percent of online time Flights Across All Cities")
The individual city delay ratios by airline show that ALASKA airlines had a lower individual delay ratio than AMWEST in every single city. However the aggregated data shows that AMWEST is mostly on-time airline, due to the larger volume of flights.