(1) Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3

(2) Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.

table <- read.csv("https://raw.githubusercontent.com/Zchen116/assignment-2/master/flights.csv", header = TRUE, stringsAsFactors = FALSE)

Remove the blank row that contains NAs

table <- table[!apply(is.na(table) | table == "", 1, all),] 
table
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

rename column

names(table)[1] <- "Airline"
names(table)[2] <- "Arrival_Status"
table
##   Airline Arrival_Status Los.Angeles Phoenix San.Diego San.Francisco
## 1  ALASKA        on time         497     221       212           503
## 2                delayed          62      12        20           102
## 4 AM WEST        on time         694    4840       383           320
## 5                delayed         117     415        65           129
##   Seattle
## 1    1841
## 2     305
## 4     201
## 5      61

Fill in the missing values

table$Airline[2] <- table$Airline[1]
table$Airline[4] <- table$Airline[3]
table
##   Airline Arrival_Status Los.Angeles Phoenix San.Diego San.Francisco
## 1  ALASKA        on time         497     221       212           503
## 2  ALASKA        delayed          62      12        20           102
## 4 AM WEST        on time         694    4840       383           320
## 5 AM WEST        delayed         117     415        65           129
##   Seattle
## 1    1841
## 2     305
## 4     201
## 5      61

Combine City and Flights in one data table

airlines <- gather(table, "City", "Flights", 3:7)
airlines
##    Airline Arrival_Status          City Flights
## 1   ALASKA        on time   Los.Angeles     497
## 2   ALASKA        delayed   Los.Angeles      62
## 3  AM WEST        on time   Los.Angeles     694
## 4  AM WEST        delayed   Los.Angeles     117
## 5   ALASKA        on time       Phoenix     221
## 6   ALASKA        delayed       Phoenix      12
## 7  AM WEST        on time       Phoenix    4840
## 8  AM WEST        delayed       Phoenix     415
## 9   ALASKA        on time     San.Diego     212
## 10  ALASKA        delayed     San.Diego      20
## 11 AM WEST        on time     San.Diego     383
## 12 AM WEST        delayed     San.Diego      65
## 13  ALASKA        on time San.Francisco     503
## 14  ALASKA        delayed San.Francisco     102
## 15 AM WEST        on time San.Francisco     320
## 16 AM WEST        delayed San.Francisco     129
## 17  ALASKA        on time       Seattle    1841
## 18  ALASKA        delayed       Seattle     305
## 19 AM WEST        on time       Seattle     201
## 20 AM WEST        delayed       Seattle      61

Combine Arrival Status and Flights in one data table

airlines_2 <- spread(airlines, Arrival_Status, Flights)
airlines_2
##    Airline          City delayed on time
## 1   ALASKA   Los.Angeles      62     497
## 2   ALASKA       Phoenix      12     221
## 3   ALASKA     San.Diego      20     212
## 4   ALASKA San.Francisco     102     503
## 5   ALASKA       Seattle     305    1841
## 6  AM WEST   Los.Angeles     117     694
## 7  AM WEST       Phoenix     415    4840
## 8  AM WEST     San.Diego      65     383
## 9  AM WEST San.Francisco     129     320
## 10 AM WEST       Seattle      61     201

(3) Perform analysis to compare the arrival delays for the two airlines.

Compare_City <- airlines_2 %>%
  group_by(City, Airline) %>%
  summarize(delay_ratio = sum(delayed) / (sum(delayed) + sum(`on time`)))
Compare_City %>% spread(Airline, delay_ratio)
## # A tibble: 5 x 3
## # Groups:   City [5]
##   City          ALASKA `AM WEST`
##   <chr>          <dbl>     <dbl>
## 1 Los.Angeles   0.111     0.144 
## 2 Phoenix       0.0515    0.0790
## 3 San.Diego     0.0862    0.145 
## 4 San.Francisco 0.169     0.287 
## 5 Seattle       0.142     0.233

From cities delay comparison, we see that ALASKA’s delay ratio is smaller than AM WEST’s in every city.

Compare_airlines <- airlines_2 %>%
  group_by(Airline) %>% 
  summarize(total_delayed = sum(delayed), total_flights = sum(delayed) + sum(`on time`), delay_ratio = sum(delayed) / (sum(delayed) + sum(`on time`))) 
Compare_airlines
## # A tibble: 2 x 4
##   Airline total_delayed total_flights delay_ratio
##   <chr>           <int>         <int>       <dbl>
## 1 ALASKA            501          3775       0.133
## 2 AM WEST           787          7225       0.109

From airlines delay comparison on total flights and delay ratio, we see that AM WEST had more flights than ALASKA (about twice as more as ALASKA) with a smaller delay ratio.