Abstract

In this project, goal is to read a .csv of data showing counts of whether two airlines were on time or delayed flying into multiple destinations, transform and tidy the data, and then do an analysis to compare the arrival delays for both airlines.

Preparation

prerequisite

library('dplyr')
library('tidyr')

Data Import

df <- read.csv("airline.csv", header=TRUE,stringsAsFactors = FALSE)
df
##        X     X.1 LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1 ALASKA on time        497     221      212          503    1841
## 2        delayed         62      12       20          102     305
## 3                        NA      NA       NA           NA      NA
## 4 AMWEST on time        694    4840      383          320     201
## 5        delayed        117     415       65          129      61

Data Transformation

#add missing data
df[2, 1] <- "ALASKA"
df[5, 1] <- "AMWEST"
df
##        X     X.1 LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1 ALASKA on time        497     221      212          503    1841
## 2 ALASKA delayed         62      12       20          102     305
## 3                        NA      NA       NA           NA      NA
## 4 AMWEST on time        694    4840      383          320     201
## 5 AMWEST delayed        117     415       65          129      61
data_long <- gather(df, city, number, LosAngeles:Seattle, factor_key=TRUE)

# rename column name.
colnames(data_long)[2] <- "status" 
colnames(data_long)[1] <- "Airline"
data_long$Airline <- factor(data_long$Airline)
data_long
##    Airline  status         city number
## 1   ALASKA on time   LosAngeles    497
## 2   ALASKA delayed   LosAngeles     62
## 3                    LosAngeles     NA
## 4   AMWEST on time   LosAngeles    694
## 5   AMWEST delayed   LosAngeles    117
## 6   ALASKA on time      Phoenix    221
## 7   ALASKA delayed      Phoenix     12
## 8                       Phoenix     NA
## 9   AMWEST on time      Phoenix   4840
## 10  AMWEST delayed      Phoenix    415
## 11  ALASKA on time     SanDiego    212
## 12  ALASKA delayed     SanDiego     20
## 13                     SanDiego     NA
## 14  AMWEST on time     SanDiego    383
## 15  AMWEST delayed     SanDiego     65
## 16  ALASKA on time SanFrancisco    503
## 17  ALASKA delayed SanFrancisco    102
## 18                 SanFrancisco     NA
## 19  AMWEST on time SanFrancisco    320
## 20  AMWEST delayed SanFrancisco    129
## 21  ALASKA on time      Seattle   1841
## 22  ALASKA delayed      Seattle    305
## 23                      Seattle     NA
## 24  AMWEST on time      Seattle    201
## 25  AMWEST delayed      Seattle     61
# remove NA
clean_data <- data_long %>%
  filter(!is.na(number))
clean_data
##    Airline  status         city number
## 1   ALASKA on time   LosAngeles    497
## 2   ALASKA delayed   LosAngeles     62
## 3   AMWEST on time   LosAngeles    694
## 4   AMWEST delayed   LosAngeles    117
## 5   ALASKA on time      Phoenix    221
## 6   ALASKA delayed      Phoenix     12
## 7   AMWEST on time      Phoenix   4840
## 8   AMWEST delayed      Phoenix    415
## 9   ALASKA on time     SanDiego    212
## 10  ALASKA delayed     SanDiego     20
## 11  AMWEST on time     SanDiego    383
## 12  AMWEST delayed     SanDiego     65
## 13  ALASKA on time SanFrancisco    503
## 14  ALASKA delayed SanFrancisco    102
## 15  AMWEST on time SanFrancisco    320
## 16  AMWEST delayed SanFrancisco    129
## 17  ALASKA on time      Seattle   1841
## 18  ALASKA delayed      Seattle    305
## 19  AMWEST on time      Seattle    201
## 20  AMWEST delayed      Seattle     61

Data Analysis

clean_data<-spread(clean_data, "status" , number)
clean_data
##    Airline         city delayed on time
## 1   ALASKA   LosAngeles      62     497
## 2   ALASKA      Phoenix      12     221
## 3   ALASKA     SanDiego      20     212
## 4   ALASKA SanFrancisco     102     503
## 5   ALASKA      Seattle     305    1841
## 6   AMWEST   LosAngeles     117     694
## 7   AMWEST      Phoenix     415    4840
## 8   AMWEST     SanDiego      65     383
## 9   AMWEST SanFrancisco     129     320
## 10  AMWEST      Seattle      61     201
analysis <- as.data.frame(clean_data %>%
  mutate(total = delayed + `on time`) %>%
  group_by(Airline) %>%
  summarise(delayed_total = sum(delayed), 
            on_time_total = sum(`on time`),
            count_total = sum(total),
            percent_ontime = round(on_time_total/count_total*100,digit = 2)))
analysis
##   Airline delayed_total on_time_total count_total percent_ontime
## 1  ALASKA           501          3274        3775          86.73
## 2  AMWEST           787          6438        7225          89.11

chart

dotchart(analysis$percent_ontime, labels=analysis$airline, main = "percent of online time Flights Across All Cities")

Conclusion

The individual city delay ratios by airline show that ALASKA airlines had a lower individual delay ratio than AMWEST in every single city. However the aggregated data shows that AMWEST is mostly on-time airline, due to the larger volume of flights.