Prob Statement


  1. Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above.

You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can

practice tidying and transformations as described below.

  1. Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and

transform your data.

  1. Perform analysis to compare the arrival delays for the two airlines.

  2. Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative

descriptions of your data cleanup work, analysis, and conclusions.


RPubs Link:

http://rpubs.com/ssufian/532710

Github Links:

https://github.com/ssufian/Data_607


Loading data and importing necessary packages

library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#loading "wide" data from Github Repo


url <- 'https://raw.githubusercontent.com/ssufian/Data_607/master/airlines.csv'

df1 <- read.delim(file =url, header=T , stringsAsFactors =FALSE, sep = ",")

df1 # checking the raw data sets
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM West on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61
df1 <- df1 %>% mutate_all(na_if,"")%>%  
  do(na.locf(.)) %>% 
  distinct %>% # remove duplications after filling in empty lines with previous row
  rename(Airlines= X, arr_time = X.1) 


df1 <- gather(df1,"destination","N_flights",3:7)

df1
##    Airlines arr_time   destination N_flights
## 1    Alaska  on time   Los.Angeles       497
## 2    Alaska  delayed   Los.Angeles        62
## 3   AM West  on time   Los.Angeles       694
## 4   AM West  delayed   Los.Angeles       117
## 5    Alaska  on time       Phoenix       221
## 6    Alaska  delayed       Phoenix        12
## 7   AM West  on time       Phoenix      4840
## 8   AM West  delayed       Phoenix       415
## 9    Alaska  on time     San.Diego       212
## 10   Alaska  delayed     San.Diego        20
## 11  AM West  on time     San.Diego       383
## 12  AM West  delayed     San.Diego        65
## 13   Alaska  on time San.Francisco       503
## 14   Alaska  delayed San.Francisco       102
## 15  AM West  on time San.Francisco       320
## 16  AM West  delayed San.Francisco       129
## 17   Alaska  on time       Seattle      1841
## 18   Alaska  delayed       Seattle       305
## 19  AM West  on time       Seattle       201
## 20  AM West  delayed       Seattle        61
df2 <- df1 # to use as spread 

df1 <- df1 %>% 
  group_by(Airlines, arr_time) %>% 
  summarise(n_flights=sum(N_flights)) %>% 
  group_by(Airlines) %>% 
  mutate(percent_of_flights = n_flights/sum(n_flights))

# to get a dataframe in the format to compute the overall arrival performance of the airlines
df1 
## # A tibble: 4 x 4
## # Groups:   Airlines [2]
##   Airlines arr_time n_flights percent_of_flights
##   <chr>    <chr>        <int>              <dbl>
## 1 Alaska   delayed        501              0.133
## 2 Alaska   on time       3274              0.867
## 3 AM West  delayed        787              0.109
## 4 AM West  on time       6438              0.891
#seperating delays and on-time into 2 columns
df2 <- spread(df2,arr_time, N_flights)

ctm2 <- df2 %>%
        clean_names() #using janitor library to remove spaces and put int underline in column name

ctm2
##    airlines   destination delayed on_time
## 1    Alaska   Los.Angeles      62     497
## 2    Alaska       Phoenix      12     221
## 3    Alaska     San.Diego      20     212
## 4    Alaska San.Francisco     102     503
## 5    Alaska       Seattle     305    1841
## 6   AM West   Los.Angeles     117     694
## 7   AM West       Phoenix     415    4840
## 8   AM West     San.Diego      65     383
## 9   AM West San.Francisco     129     320
## 10  AM West       Seattle      61     201
ddf3 <- ctm2 %>% 
    group_by(airlines,destination)%>% 
    mutate(percent_delayedflts = delayed/sum(delayed, on_time)) %>% 
    mutate(tot_flts = sum(delayed, on_time))
  
# per city performance grouped by airlines % destinations
ddf3 
## # A tibble: 10 x 6
## # Groups:   airlines, destination [10]
##    airlines destination   delayed on_time percent_delayedflts tot_flts
##    <chr>    <chr>           <int>   <int>               <dbl>    <int>
##  1 Alaska   Los.Angeles        62     497              0.111       559
##  2 Alaska   Phoenix            12     221              0.0515      233
##  3 Alaska   San.Diego          20     212              0.0862      232
##  4 Alaska   San.Francisco     102     503              0.169       605
##  5 Alaska   Seattle           305    1841              0.142      2146
##  6 AM West  Los.Angeles       117     694              0.144       811
##  7 AM West  Phoenix           415    4840              0.0790     5255
##  8 AM West  San.Diego          65     383              0.145       448
##  9 AM West  San.Francisco     129     320              0.287       449
## 10 AM West  Seattle            61     201              0.233       262
ddf4 <- ddf3 %>% 
        group_by(airlines) %>% 
        mutate(total_flights = sum(tot_flts)) %>% 
        summarise(by_air= sum(tot_flts))

ddf4
## # A tibble: 2 x 2
##   airlines by_air
##   <chr>     <int>
## 1 Alaska     3775
## 2 AM West    7225

Summary of actions taken during the data munging process:


Data Analysis 1

Comparision of overall performance between the two airlines


require(ggthemes)
## Loading required package: ggthemes
library(ggplot2)

# Basic barplot - abs numbers
df1 %>% group_by(Airlines, arr_time) %>% 
  group_by(percent_of_flights) %>% 
  ggplot(aes(x=Airlines, y=n_flights, fill = arr_time)) +
  geom_bar(stat="identity", width = 0.5, position="dodge")+
  geom_text(aes(label=n_flights), vjust=1.6, color="black",position = position_dodge(0.45), size=3.5)+
  ggtitle("Number of Delay Arrivals by Airlines")+
  theme_economist()

library(formattable)

# Basic barplot - percentage %
df1 %>% group_by(Airlines, arr_time) %>% 
  group_by(percent_of_flights) %>% 
  ggplot(aes(x=Airlines, y=percent_of_flights, fill = arr_time)) +
  geom_bar(stat="identity", width = 0.5, position="dodge")+
  geom_text(aes(label=percent(round(percent_of_flights,digits=2))), vjust=1.6, color="black",position =  position_dodge(0.45),     size=3.5)+
  ggtitle("% of Delay Arrivals by Airlines")+
  theme_economist()


Observation 1:

Looking at the abs. values of on-tme and delays of AM West relative to Alaska side-by-side, it seems that

the higher no. of delays in AM West is “small” relative to the number of on-time arrival of AM West,

which is twice the amount compared to Alaska’s. However, A second look by percentage basis, showed that

overall, AM West performed better than Alaska Airways; 11% delays for AM West vs. 13% delays Alaska.

But is it?


Data Analysis 2

Comparision between arrival delays for the two airlines per destinations of the two airways to explain

this discrepancy…

# Basic barplot - arrival performance of the 2 airlines by destinations
ddf3 %>% 
  ggplot(aes(x=destination, y=percent_delayedflts, fill = airlines)) +
  geom_bar(stat="identity", width = 0.5,  position="dodge")+
  geom_text(aes(label=percent(percent_delayedflts)), vjust=0.1, color="black",position = position_dodge(0.55), size=3.00)+
  geom_text(aes(label=destination), vjust=1.4, color="black",position = position_dodge(0.55), size=3.25)+
  ggtitle("Per city Delays of the 2 airlines")+
  theme_excel()

#loading data again to perform arrival comparasion BUT by destination
dfdest <- read.delim(file =url, header=T , stringsAsFactors =FALSE, sep = ",")

## Fill in empty rows with NA and Updating rows with earlier/previous rows values
dfdest <- dfdest %>% filter(complete.cases(" "))%>% 
  mutate_all(na_if,"")%>%  
  do(na.locf(.)) %>% 
  distinct%>% 
  rename(Airlines= X, arr_time = X.1) 

dfdest
##   Airlines arr_time Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   Alaska  on time         497     221       212           503    1841
## 2   Alaska  delayed          62      12        20           102     305
## 3  AM West  on time         694    4840       383           320     201
## 4  AM West  delayed         117     415        65           129      61
#making it long format
dfdest1<- gather(dfdest,"destination","N_flights",3:7)

dfdest1
##    Airlines arr_time   destination N_flights
## 1    Alaska  on time   Los.Angeles       497
## 2    Alaska  delayed   Los.Angeles        62
## 3   AM West  on time   Los.Angeles       694
## 4   AM West  delayed   Los.Angeles       117
## 5    Alaska  on time       Phoenix       221
## 6    Alaska  delayed       Phoenix        12
## 7   AM West  on time       Phoenix      4840
## 8   AM West  delayed       Phoenix       415
## 9    Alaska  on time     San.Diego       212
## 10   Alaska  delayed     San.Diego        20
## 11  AM West  on time     San.Diego       383
## 12  AM West  delayed     San.Diego        65
## 13   Alaska  on time San.Francisco       503
## 14   Alaska  delayed San.Francisco       102
## 15  AM West  on time San.Francisco       320
## 16  AM West  delayed San.Francisco       129
## 17   Alaska  on time       Seattle      1841
## 18   Alaska  delayed       Seattle       305
## 19  AM West  on time       Seattle       201
## 20  AM West  delayed       Seattle        61
# this has another layer of groupby; adding destination
dfdest1 <- dfdest1 %>% 
  group_by(Airlines, arr_time, destination) %>% 
  summarise(n_flights=sum(N_flights)) %>% 
  group_by(Airlines) %>% 
  mutate(percent_of_flights = n_flights/sum(n_flights)) %>% 
  mutate(rank = rank(percent_of_flights)) %>% 
  filter(arr_time =="delayed")

dfdest1
## # A tibble: 10 x 6
## # Groups:   Airlines [2]
##    Airlines arr_time destination   n_flights percent_of_flights  rank
##    <chr>    <chr>    <chr>             <int>              <dbl> <dbl>
##  1 Alaska   delayed  Los.Angeles          62            0.0164      3
##  2 Alaska   delayed  Phoenix              12            0.00318     1
##  3 Alaska   delayed  San.Diego            20            0.00530     2
##  4 Alaska   delayed  San.Francisco       102            0.0270      4
##  5 Alaska   delayed  Seattle             305            0.0808      7
##  6 AM West  delayed  Los.Angeles         117            0.0162      3
##  7 AM West  delayed  Phoenix             415            0.0574      8
##  8 AM West  delayed  San.Diego            65            0.00900     2
##  9 AM West  delayed  San.Francisco       129            0.0179      4
## 10 AM West  delayed  Seattle              61            0.00844     1

Checking and confirming the actual numbers of delays per city of the two airlines


# Basic barplot - Numbers of arrival delays per city
dfdest1 %>% group_by(Airlines, arr_time, destination) %>% 
  group_by(destination) %>% 
  ggplot(aes(x=destination, y=n_flights, fill = Airlines)) +
  geom_bar(stat="identity", width = 0.5, position="dodge")+
  geom_text(aes(label=n_flights), vjust=0.1, color="black",position = position_dodge(0.55), size=3.00)+
  geom_text(aes(label=destination), vjust=1.4, color="black",position = position_dodge(0.55), size=3.25)+
  ggtitle("No. of Delays per Service Cities")+
  theme_excel()

# Basic barplot - Total flights taken by 2 airlines per each city
ddf3 %>% 
  ggplot(aes(x=destination, y=tot_flts, fill = airlines)) +
  geom_bar(stat="identity", width = 0.5,  position="dodge")+
  geom_text(aes(label=(tot_flts)), vjust=0.1, color="black",position = position_dodge(0.55), size=3.00)+
  geom_text(aes(label=destination), vjust=1.4, color="black",position = position_dodge(0.55), size=3.25)+
  ggtitle("Total flights taken by the airlines per city")+
  theme_excel()

# Basic barplot - Total no. of flts taken by each airlines
ddf4 %>% 
  ggplot(aes(x=airlines, y=by_air, fill=airlines)) +
  geom_bar(stat="identity", width = 0.5,  position="dodge")+
  geom_text(aes(label=by_air), vjust=0.1, color="black",position = position_dodge(0.55), size=3.00)+
  geom_text(aes(label=airlines), vjust=1.4, color="black",position = position_dodge(0.55), size=3.25)+
  ggtitle("Total no. of flights taken by the each airlines")+
 
  theme_excel()


Observation 2:

By all measures, the per-city performance of Alaska Airways is better than AM West (as measured by abs.

numbers and percentage basis). The only exception was Seattle in which Alaska was worse than AM West.

The reason, the overall performance of AM West first appeared better was because they flew more flights

overall, and as such, with a bigger denominator in terms of total flights, the percentage delays

per total flights taken resulted in a smaller ratio than Alaska. In fact, AM West flew twice as

much as Alaska Airways did.