1. The chart above describes arrival delays for two airlines across five destinations. Your task is to:
    Task#1: Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
    Task#2: Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
    Task#3: Perform analysis to compare the arrival delays for the two airlines.
    Task#4: Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative
    descriptions of your data cleanup work, analysis, and conclusions. Please include in your homework submission:

Data Acquisition

flights.data = read.csv('https://raw.githubusercontent.com/niteen11/MSDS/master/DATA607/Week5/dataset/fligthts.csv',sep = ',')
flights.data
##   Carrier  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska on_time         497     221       212           503    1841
## 2  Alaska delayed          62      12        20           102     305
## 3  AMWEST on_time         694    4840       383           320     201
## 4  AMWEST delayed         117     415        65           129      61

Environment Set Up

Load required libraries

library(knitr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
summary(flights.data)
##    Carrier      Status   Los.Angeles       Phoenix         San.Diego     
##  Alaska:2   delayed:2   Min.   : 62.0   Min.   :  12.0   Min.   : 20.00  
##  AMWEST:2   on_time:2   1st Qu.:103.2   1st Qu.: 168.8   1st Qu.: 53.75  
##                         Median :307.0   Median : 318.0   Median :138.50  
##                         Mean   :342.5   Mean   :1372.0   Mean   :170.00  
##                         3rd Qu.:546.2   3rd Qu.:1521.2   3rd Qu.:254.75  
##                         Max.   :694.0   Max.   :4840.0   Max.   :383.00  
##  San.Francisco      Seattle    
##  Min.   :102.0   Min.   :  61  
##  1st Qu.:122.2   1st Qu.: 166  
##  Median :224.5   Median : 253  
##  Mean   :263.5   Mean   : 602  
##  3rd Qu.:365.8   3rd Qu.: 689  
##  Max.   :503.0   Max.   :1841

Tidying Data

The loaded dataset is untidy, so let’s format into a ‘tidy’ format

flight <- gather(flights.data, City, Flight_Count, Los.Angeles:Seattle)
flight
##    Carrier  Status          City Flight_Count
## 1   Alaska on_time   Los.Angeles          497
## 2   Alaska delayed   Los.Angeles           62
## 3   AMWEST on_time   Los.Angeles          694
## 4   AMWEST delayed   Los.Angeles          117
## 5   Alaska on_time       Phoenix          221
## 6   Alaska delayed       Phoenix           12
## 7   AMWEST on_time       Phoenix         4840
## 8   AMWEST delayed       Phoenix          415
## 9   Alaska on_time     San.Diego          212
## 10  Alaska delayed     San.Diego           20
## 11  AMWEST on_time     San.Diego          383
## 12  AMWEST delayed     San.Diego           65
## 13  Alaska on_time San.Francisco          503
## 14  Alaska delayed San.Francisco          102
## 15  AMWEST on_time San.Francisco          320
## 16  AMWEST delayed San.Francisco          129
## 17  Alaska on_time       Seattle         1841
## 18  Alaska delayed       Seattle          305
## 19  AMWEST on_time       Seattle          201
## 20  AMWEST delayed       Seattle           61

Let’s analyze the tidy dataset to compare the arrival delays.
Calculate the total number of flights by each carrier (airlines) to different destinations (Cities)

total.flights <- flight %>% group_by(Carrier) %>% 
      summarize(Total_Flights = sum(Flight_Count))
total.flights
## # A tibble: 2 x 2
##   Carrier Total_Flights
##   <fctr>          <int>
## 1 Alaska           3775
## 2 AMWEST           7225

Let’s calculate the number of on-time flights for each carrier

flights.ontime <- flight %>% group_by(Carrier) %>% 
  filter(Status == 'on_time') %>% 
  summarize(FlightsOnTime = sum(Flight_Count))
flights.ontime
## # A tibble: 2 x 2
##   Carrier FlightsOnTime
##   <fctr>          <int>
## 1 Alaska           3274
## 2 AMWEST           6438

Let’s find the number of delayed flights for each carrier

flights.delayed <- flight %>% group_by(Carrier) %>% 
  filter(Status == 'delayed') %>% 
  summarize(FlightsDelayed = sum(Flight_Count))
flights.delayed
## # A tibble: 2 x 2
##   Carrier FlightsDelayed
##   <fctr>           <int>
## 1 Alaska             501
## 2 AMWEST             787
flights.info = cbind(flights.ontime,FlightsDelayed = flights.delayed$FlightsDelayed, TotalFlights=total.flights$Total_Flights)
flights.info
##   Carrier FlightsOnTime FlightsDelayed TotalFlights
## 1  Alaska          3274            501         3775
## 2  AMWEST          6438            787         7225
flights.summary <- mutate(flights.info, PercentOnTime = FlightsOnTime/TotalFlights, PercentDelayed= FlightsDelayed/TotalFlights)
flights.summary
##   Carrier FlightsOnTime FlightsDelayed TotalFlights PercentOnTime
## 1  Alaska          3274            501         3775     0.8672848
## 2  AMWEST          6438            787         7225     0.8910727
##   PercentDelayed
## 1      0.1327152
## 2      0.1089273

Analyzing further…

Carrier.Alaska <- filter(flight, Carrier == 'Alaska')
Carrier.Alaska
##    Carrier  Status          City Flight_Count
## 1   Alaska on_time   Los.Angeles          497
## 2   Alaska delayed   Los.Angeles           62
## 3   Alaska on_time       Phoenix          221
## 4   Alaska delayed       Phoenix           12
## 5   Alaska on_time     San.Diego          212
## 6   Alaska delayed     San.Diego           20
## 7   Alaska on_time San.Francisco          503
## 8   Alaska delayed San.Francisco          102
## 9   Alaska on_time       Seattle         1841
## 10  Alaska delayed       Seattle          305
Carrier.AMWEST <- filter(flight, Carrier == 'AMWEST')
Carrier.AMWEST
##    Carrier  Status          City Flight_Count
## 1   AMWEST on_time   Los.Angeles          694
## 2   AMWEST delayed   Los.Angeles          117
## 3   AMWEST on_time       Phoenix         4840
## 4   AMWEST delayed       Phoenix          415
## 5   AMWEST on_time     San.Diego          383
## 6   AMWEST delayed     San.Diego           65
## 7   AMWEST on_time San.Francisco          320
## 8   AMWEST delayed San.Francisco          129
## 9   AMWEST on_time       Seattle          201
## 10  AMWEST delayed       Seattle           61

For Alaskan airlines which city had most number of delays:

alaska.delays <- filter(Carrier.Alaska,Status=='delayed')
city.alaska.delays <- alaska.delays %>%
                      arrange(desc(alaska.delays$Flight_Count))
city.alaska.delays
##   Carrier  Status          City Flight_Count
## 1  Alaska delayed       Seattle          305
## 2  Alaska delayed San.Francisco          102
## 3  Alaska delayed   Los.Angeles           62
## 4  Alaska delayed     San.Diego           20
## 5  Alaska delayed       Phoenix           12
alaska.ontime <- filter(Carrier.Alaska,Status=='on_time')
city.alaska.ontime.rank <- alaska.ontime %>%
                      arrange(desc(alaska.ontime$Flight_Count))
city.alaska.ontime.rank
##   Carrier  Status          City Flight_Count
## 1  Alaska on_time       Seattle         1841
## 2  Alaska on_time San.Francisco          503
## 3  Alaska on_time   Los.Angeles          497
## 4  Alaska on_time       Phoenix          221
## 5  Alaska on_time     San.Diego          212
city.amwest.delays <-  filter(Carrier.AMWEST, Status == 'delayed')
city.amwest.delays <- city.amwest.delays %>%
                      arrange(desc(city.amwest.delays$Flight_Count))
city.amwest.delays
##   Carrier  Status          City Flight_Count
## 1  AMWEST delayed       Phoenix          415
## 2  AMWEST delayed San.Francisco          129
## 3  AMWEST delayed   Los.Angeles          117
## 4  AMWEST delayed     San.Diego           65
## 5  AMWEST delayed       Seattle           61

Let’s use Spread function on the delayed dataset

city.alaska.delays.spread <- spread(city.alaska.delays, City, Flight_Count)
kable(city.alaska.delays.spread)
Carrier Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
Alaska delayed 62 12 20 102 305
library(ggplot2)
ggplot(data = city.alaska.delays, aes(x=City,y=Flight_Count))+
  geom_point(alpha = 0.5, size = 5, color ='blue')+
  geom_text(aes(x = City, y = Flight_Count, 
                                    label = paste(Flight_Count),
                                    group = Carrier,
                                    vjust = -0.5))

city.flight.delays <-  filter(flight, Status == 'delayed')
city.flight.delays <- city.flight.delays %>%
                      arrange(desc(city.flight.delays$Flight_Count))
city.flight.delays
##    Carrier  Status          City Flight_Count
## 1   AMWEST delayed       Phoenix          415
## 2   Alaska delayed       Seattle          305
## 3   AMWEST delayed San.Francisco          129
## 4   AMWEST delayed   Los.Angeles          117
## 5   Alaska delayed San.Francisco          102
## 6   AMWEST delayed     San.Diego           65
## 7   Alaska delayed   Los.Angeles           62
## 8   AMWEST delayed       Seattle           61
## 9   Alaska delayed     San.Diego           20
## 10  Alaska delayed       Phoenix           12
kable(city.flight.delays)
Carrier Status City Flight_Count
AMWEST delayed Phoenix 415
Alaska delayed Seattle 305
AMWEST delayed San.Francisco 129
AMWEST delayed Los.Angeles 117
Alaska delayed San.Francisco 102
AMWEST delayed San.Diego 65
Alaska delayed Los.Angeles 62
AMWEST delayed Seattle 61
Alaska delayed San.Diego 20
Alaska delayed Phoenix 12
library(ggplot2)
ggplot(data = city.flight.delays, aes(x=Carrier,y=Flight_Count))+
  #geom_point(alpha = 0.5, size = 5, color ='blue')
  geom_bar(stat = 'identity',aes(fill=Carrier))+
  geom_text(aes(x = Carrier, y = Flight_Count, 
                                    label = paste(Flight_Count),
                                    group = Carrier,
                                    vjust = -0.4)) +
  labs(title = "Comparing Delays by Carrier & City", 
                         x = "Carrier", 
                         y = "Delay Flight Count") +
  facet_wrap(~City, ncol = 5)+
  theme_bw()

Let’s use spread function to view the dataset for delayed flights

flight.delay.spread <- spread(city.flight.delays,City, Flight_Count)
kable(flight.delay.spread)
Carrier Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
Alaska delayed 62 12 20 102 305
AMWEST delayed 117 415 65 129 61
flight.df <- flight %>%
    spread(Status,'Flight_Count')
kable(flight.df)
Carrier City delayed on_time
Alaska Los.Angeles 62 497
Alaska Phoenix 12 221
Alaska San.Diego 20 212
Alaska San.Francisco 102 503
Alaska Seattle 305 1841
AMWEST Los.Angeles 117 694
AMWEST Phoenix 415 4840
AMWEST San.Diego 65 383
AMWEST San.Francisco 129 320
AMWEST Seattle 61 201
flight_percentile <- mutate(flight.df,delay_percent=round((delayed/(delayed+on_time)*100),2),
                            onTime_percent=round((on_time/(delayed+on_time))*100,2))
kable(flight_percentile)
Carrier City delayed on_time delay_percent onTime_percent
Alaska Los.Angeles 62 497 11.09 88.91
Alaska Phoenix 12 221 5.15 94.85
Alaska San.Diego 20 212 8.62 91.38
Alaska San.Francisco 102 503 16.86 83.14
Alaska Seattle 305 1841 14.21 85.79
AMWEST Los.Angeles 117 694 14.43 85.57
AMWEST Phoenix 415 4840 7.90 92.10
AMWEST San.Diego 65 383 14.51 85.49
AMWEST San.Francisco 129 320 28.73 71.27
AMWEST Seattle 61 201 23.28 76.72
library(ggplot2)
ggplot(data = flight_percentile, aes(x=Carrier,y=delay_percent))+
  geom_bar(stat = 'identity',aes(fill=Carrier))+
  geom_text(aes(x = Carrier, y = delay_percent, 
                                    label = paste(delay_percent,'%'),
                                    group = Carrier,
                                    vjust = -0.4)) +
  labs(title = "Comparing Delays(%) by Carrier & City", 
                         x = "Carrier", 
                         y = "Delay Percent") +
  facet_wrap(~City, ncol = 5)+
  theme_bw()

Conclusion:
(a) From the above plot, it can be concluded that AMWEST has most delay rate across 5 cities as compared to Alaska.
(b) San Fransicso has highest delay rate for both carriers.
(c) Phoenix has lowest delay rate for both carriers.