1. The chart above describes arrival delays for two airlines across five destinations. Your task is to:
    Task#1: Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
    Task#2: Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
    Task#3: Perform analysis to compare the arrival delays for the two airlines.
    Task#4: Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative
    descriptions of your data cleanup work, analysis, and conclusions. Please include in your homework submission:

1 Data Acquisition

library(knitr)
## Warning: package 'knitr' was built under R version 3.4.4
flights.data = read.csv('https://raw.githubusercontent.com/keshaws/CUNY_MSDS_2020/master/DATA607/fligthts.csv',sep = ',')
kable(flights.data)
Carrier Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
Alaska on_time 497 221 212 503 1841
Alaska delayed 62 12 20 102 305
AMWEST on_time 694 4840 383 320 201
AMWEST delayed 117 415 65 129 61

2 Environment Set Up

library(tidyr)
library(dplyr)
summary(flights.data)
##    Carrier      Status   Los.Angeles       Phoenix         San.Diego     
##  Alaska:2   delayed:2   Min.   : 62.0   Min.   :  12.0   Min.   : 20.00  
##  AMWEST:2   on_time:2   1st Qu.:103.2   1st Qu.: 168.8   1st Qu.: 53.75  
##                         Median :307.0   Median : 318.0   Median :138.50  
##                         Mean   :342.5   Mean   :1372.0   Mean   :170.00  
##                         3rd Qu.:546.2   3rd Qu.:1521.2   3rd Qu.:254.75  
##                         Max.   :694.0   Max.   :4840.0   Max.   :383.00  
##  San.Francisco      Seattle    
##  Min.   :102.0   Min.   :  61  
##  1st Qu.:122.2   1st Qu.: 166  
##  Median :224.5   Median : 253  
##  Mean   :263.5   Mean   : 602  
##  3rd Qu.:365.8   3rd Qu.: 689  
##  Max.   :503.0   Max.   :1841

3 Tidying Data

The dataset is untidy, so using gather function to convert the imported data into a ‘tidy’ format

flight <- gather(flights.data, City, Flight_Count, Los.Angeles:Seattle)
kable(flight)
Carrier Status City Flight_Count
Alaska on_time Los.Angeles 497
Alaska delayed Los.Angeles 62
AMWEST on_time Los.Angeles 694
AMWEST delayed Los.Angeles 117
Alaska on_time Phoenix 221
Alaska delayed Phoenix 12
AMWEST on_time Phoenix 4840
AMWEST delayed Phoenix 415
Alaska on_time San.Diego 212
Alaska delayed San.Diego 20
AMWEST on_time San.Diego 383
AMWEST delayed San.Diego 65
Alaska on_time San.Francisco 503
Alaska delayed San.Francisco 102
AMWEST on_time San.Francisco 320
AMWEST delayed San.Francisco 129
Alaska on_time Seattle 1841
Alaska delayed Seattle 305
AMWEST on_time Seattle 201
AMWEST delayed Seattle 61

4 Total number of flights by each carrier (airlines) to different destinations (Cities)

total.flights <- flight %>% group_by(Carrier) %>% 
      summarize(Total_Flights = sum(Flight_Count))
total.flights
## # A tibble: 2 x 2
##   Carrier Total_Flights
##   <fct>           <int>
## 1 Alaska           3775
## 2 AMWEST           7225

5 On-time flights for each carrier

flights.ontime <- flight %>% group_by(Carrier) %>% 
  filter(Status == 'on_time') %>% 
  summarize(FlightsOnTime = sum(Flight_Count))
flights.ontime
## # A tibble: 2 x 2
##   Carrier FlightsOnTime
##   <fct>           <int>
## 1 Alaska           3274
## 2 AMWEST           6438

6 Delayed flights for each carrier

flights.delayed <- flight %>% group_by(Carrier) %>% 
  filter(Status == 'delayed') %>% 
  summarize(FlightsDelayed = sum(Flight_Count))
flights.delayed
## # A tibble: 2 x 2
##   Carrier FlightsDelayed
##   <fct>            <int>
## 1 Alaska             501
## 2 AMWEST             787
flights.info = cbind(flights.ontime,FlightsDelayed = flights.delayed$FlightsDelayed, TotalFlights=total.flights$Total_Flights)
flights.info
##   Carrier FlightsOnTime FlightsDelayed TotalFlights
## 1  Alaska          3274            501         3775
## 2  AMWEST          6438            787         7225
flights.summary <- mutate(flights.info, PercentOnTime = FlightsOnTime/TotalFlights, PercentDelayed= FlightsDelayed/TotalFlights)
flights.summary
##   Carrier FlightsOnTime FlightsDelayed TotalFlights PercentOnTime
## 1  Alaska          3274            501         3775     0.8672848
## 2  AMWEST          6438            787         7225     0.8910727
##   PercentDelayed
## 1      0.1327152
## 2      0.1089273

7 Analysis

7.1 EDA (Exploratory Data Analysis)

Carrier.Alaska <- filter(flight, Carrier == 'Alaska')
kable(Carrier.Alaska)
Carrier Status City Flight_Count
Alaska on_time Los.Angeles 497
Alaska delayed Los.Angeles 62
Alaska on_time Phoenix 221
Alaska delayed Phoenix 12
Alaska on_time San.Diego 212
Alaska delayed San.Diego 20
Alaska on_time San.Francisco 503
Alaska delayed San.Francisco 102
Alaska on_time Seattle 1841
Alaska delayed Seattle 305
Carrier.AMWEST <- filter(flight, Carrier == 'AMWEST')
kable(Carrier.AMWEST)
Carrier Status City Flight_Count
AMWEST on_time Los.Angeles 694
AMWEST delayed Los.Angeles 117
AMWEST on_time Phoenix 4840
AMWEST delayed Phoenix 415
AMWEST on_time San.Diego 383
AMWEST delayed San.Diego 65
AMWEST on_time San.Francisco 320
AMWEST delayed San.Francisco 129
AMWEST on_time Seattle 201
AMWEST delayed Seattle 61

For Alaskan airlines which city had most number of delays:

alaska.delays <- filter(Carrier.Alaska,Status=='delayed')
city.alaska.delays <- alaska.delays %>%
                      arrange(desc(alaska.delays$Flight_Count))
kable(city.alaska.delays)
Carrier Status City Flight_Count
Alaska delayed Seattle 305
Alaska delayed San.Francisco 102
Alaska delayed Los.Angeles 62
Alaska delayed San.Diego 20
Alaska delayed Phoenix 12
alaska.ontime <- filter(Carrier.Alaska,Status=='on_time')
city.alaska.ontime.rank <- alaska.ontime %>%
                      arrange(desc(alaska.ontime$Flight_Count))
kable(city.alaska.ontime.rank)
Carrier Status City Flight_Count
Alaska on_time Seattle 1841
Alaska on_time San.Francisco 503
Alaska on_time Los.Angeles 497
Alaska on_time Phoenix 221
Alaska on_time San.Diego 212
city.amwest.delays <-  filter(Carrier.AMWEST, Status == 'delayed')
city.amwest.delays <- city.amwest.delays %>%
                      arrange(desc(city.amwest.delays$Flight_Count))
kable(city.amwest.delays)
Carrier Status City Flight_Count
AMWEST delayed Phoenix 415
AMWEST delayed San.Francisco 129
AMWEST delayed Los.Angeles 117
AMWEST delayed San.Diego 65
AMWEST delayed Seattle 61
city.flight.delays <-  filter(flight, Status == 'delayed')
city.flight.delays <- city.flight.delays %>%
                    arrange(desc(city.flight.delays$Flight_Count))
kable(city.flight.delays)
Carrier Status City Flight_Count
AMWEST delayed Phoenix 415
Alaska delayed Seattle 305
AMWEST delayed San.Francisco 129
AMWEST delayed Los.Angeles 117
Alaska delayed San.Francisco 102
AMWEST delayed San.Diego 65
Alaska delayed Los.Angeles 62
AMWEST delayed Seattle 61
Alaska delayed San.Diego 20
Alaska delayed Phoenix 12
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
ggplot(data = city.flight.delays, aes(x=Carrier,y=Flight_Count))+
  #geom_point(alpha = 0.5, size = 5, color ='blue')
  geom_bar(stat = 'identity',aes(fill=Carrier))+
  geom_text(aes(x = Carrier, y = Flight_Count, 
                                    label = paste(Flight_Count),
                                    group = Carrier,
                                    vjust = -0.4)) +
  labs(title = "Comparing Delays by Carrier & City", 
                         x = "Carrier", 
                         y = "Delay Flight Count") +
  facet_wrap(~City, ncol = 5)+
  theme_bw()

spread function to view the dataset for delayed flights

flight.delay.spread <- spread(city.flight.delays,City, Flight_Count)
kable(flight.delay.spread)
Carrier Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
Alaska delayed 62 12 20 102 305
AMWEST delayed 117 415 65 129 61
flight.df <- flight %>%
    spread(Status,'Flight_Count')
kable(flight.df)
Carrier City delayed on_time
Alaska Los.Angeles 62 497
Alaska Phoenix 12 221
Alaska San.Diego 20 212
Alaska San.Francisco 102 503
Alaska Seattle 305 1841
AMWEST Los.Angeles 117 694
AMWEST Phoenix 415 4840
AMWEST San.Diego 65 383
AMWEST San.Francisco 129 320
AMWEST Seattle 61 201
flight_percentile <- mutate(flight.df,delay_percent=round((delayed/(delayed+on_time)*100),2),
                            onTime_percent=round((on_time/(delayed+on_time))*100,2))
kable(flight_percentile)
Carrier City delayed on_time delay_percent onTime_percent
Alaska Los.Angeles 62 497 11.09 88.91
Alaska Phoenix 12 221 5.15 94.85
Alaska San.Diego 20 212 8.62 91.38
Alaska San.Francisco 102 503 16.86 83.14
Alaska Seattle 305 1841 14.21 85.79
AMWEST Los.Angeles 117 694 14.43 85.57
AMWEST Phoenix 415 4840 7.90 92.10
AMWEST San.Diego 65 383 14.51 85.49
AMWEST San.Francisco 129 320 28.73 71.27
AMWEST Seattle 61 201 23.28 76.72
library(ggplot2)
ggplot(data = flight_percentile, aes(x=Carrier,y=delay_percent))+
  geom_bar(stat = 'identity',aes(fill=Carrier))+
  geom_text(aes(x = Carrier, y = delay_percent, 
                                    label = paste(delay_percent,'%'),
                                    group = Carrier,
                                    vjust = -0.4)) +
  labs(title = "Comparing Delays(%) by Carrier & City", 
                         x = "Carrier", 
                         y = "Delay Percent") +
  facet_wrap(~City, ncol = 5)+
  theme_bw()

library(ggplot2)
ggplot(data = flight_percentile, aes(x=Carrier,y=onTime_percent))+
  geom_bar(stat = 'identity',aes(fill=Carrier))+
  geom_text(aes(x = Carrier, y = onTime_percent, 
                                    label = paste(onTime_percent,'%'),
                                    group = Carrier,
                                    vjust = -0.4)) +
  labs(title = "Comparing OnTime(%) by Carrier & City", 
                         x = "Carrier", 
                         y = "OnTime Percent") +
  facet_wrap(~City, ncol = 5)+
  theme_bw()

8 Conclusion:

  • San Fransicso has highest delay rate for both carriers
  • Phoenix has highest on time rate for both carriers
  • AMWEST has higher delay rate than Alaska across 5 cities