1. Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
  2. Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
  3. Perform analysis to compare the arrival delays for the two airlines.

Load libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

Read CSV file Rename Columns

airplanes <-read.csv("/Users/christinakasman/Desktop/airplanes data607.csv", header=TRUE, sep=",", stringsAsFactors=FALSE)
airplanes
##          X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   Alaska on time         497     221       212           503    1841
## 2          Delayed          62      12        20           102     305
## 3                           NA      NA        NA            NA      NA
## 4  AM West on time         694    4850       383           320     201
## 5          Delayed         117     415        65           129      61
## 6                           NA      NA        NA            NA      NA
## 7                           NA      NA        NA            NA      NA
## 8                           NA      NA        NA            NA      NA
## 9                           NA      NA        NA            NA      NA
## 10                          NA      NA        NA            NA      NA
## 11                          NA      NA        NA            NA      NA
## 12                          NA      NA        NA            NA      NA
## 13                          NA      NA        NA            NA      NA
## 14                          NA      NA        NA            NA      NA
## 15                          NA      NA        NA            NA      NA
## 16                          NA      NA        NA            NA      NA
## 17                          NA      NA        NA            NA      NA
## 18                          NA      NA        NA            NA      NA
## 19                          NA      NA        NA            NA      NA
## 20                          NA      NA        NA            NA      NA
## 21                          NA      NA        NA            NA      NA
names(airplanes) <- c("Airline", "Status", "LosAngeles", "Phoenix", "SanDiego", "SanFrancisco", "Seattle")
airplanes
##    Airline  Status LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1   Alaska on time        497     221      212          503    1841
## 2          Delayed         62      12       20          102     305
## 3                          NA      NA       NA           NA      NA
## 4  AM West on time        694    4850      383          320     201
## 5          Delayed        117     415       65          129      61
## 6                          NA      NA       NA           NA      NA
## 7                          NA      NA       NA           NA      NA
## 8                          NA      NA       NA           NA      NA
## 9                          NA      NA       NA           NA      NA
## 10                         NA      NA       NA           NA      NA
## 11                         NA      NA       NA           NA      NA
## 12                         NA      NA       NA           NA      NA
## 13                         NA      NA       NA           NA      NA
## 14                         NA      NA       NA           NA      NA
## 15                         NA      NA       NA           NA      NA
## 16                         NA      NA       NA           NA      NA
## 17                         NA      NA       NA           NA      NA
## 18                         NA      NA       NA           NA      NA
## 19                         NA      NA       NA           NA      NA
## 20                         NA      NA       NA           NA      NA
## 21                         NA      NA       NA           NA      NA

Remove NA values using drop_na from plyr package

airplanes5 <- airplanes %>% drop_na()
airplanes5
##   Airline  Status LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1  Alaska on time        497     221      212          503    1841
## 2         Delayed         62      12       20          102     305
## 4 AM West on time        694    4850      383          320     201
## 5         Delayed        117     415       65          129      61
for (i in 1:dim(airplanes5)[1]){
  if (i %% 2 == 0){
      airplanes5$Airline[i] <-  airplanes5$Airline[i-1]
  }
}
airplanes5
##   Airline  Status LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1  Alaska on time        497     221      212          503    1841
## 2  Alaska Delayed         62      12       20          102     305
## 4 AM West on time        694    4850      383          320     201
## 5 AM West Delayed        117     415       65          129      61

Use gather function to transform data into Tidy dataset

airplanesnew <- airplanes5%>% gather("city", "Count", LosAngeles:Seattle) 
airplanesnew
##    Airline  Status         city Count
## 1   Alaska on time   LosAngeles   497
## 2   Alaska Delayed   LosAngeles    62
## 3  AM West on time   LosAngeles   694
## 4  AM West Delayed   LosAngeles   117
## 5   Alaska on time      Phoenix   221
## 6   Alaska Delayed      Phoenix    12
## 7  AM West on time      Phoenix  4850
## 8  AM West Delayed      Phoenix   415
## 9   Alaska on time     SanDiego   212
## 10  Alaska Delayed     SanDiego    20
## 11 AM West on time     SanDiego   383
## 12 AM West Delayed     SanDiego    65
## 13  Alaska on time SanFrancisco   503
## 14  Alaska Delayed SanFrancisco   102
## 15 AM West on time SanFrancisco   320
## 16 AM West Delayed SanFrancisco   129
## 17  Alaska on time      Seattle  1841
## 18  Alaska Delayed      Seattle   305
## 19 AM West on time      Seattle   201
## 20 AM West Delayed      Seattle    61
airplanesnew2 <- as.data.frame(airplanesnew)
airplanesnew4 <- airplanesnew %>%  
  spread("Status", "Count")
names(airplanesnew4) <- c("Airline", "city", "Delayed", "Ontime")
airplanesnew4
##    Airline         city Delayed Ontime
## 1   Alaska   LosAngeles      62    497
## 2   Alaska      Phoenix      12    221
## 3   Alaska     SanDiego      20    212
## 4   Alaska SanFrancisco     102    503
## 5   Alaska      Seattle     305   1841
## 6  AM West   LosAngeles     117    694
## 7  AM West      Phoenix     415   4850
## 8  AM West     SanDiego      65    383
## 9  AM West SanFrancisco     129    320
## 10 AM West      Seattle      61    201
counts <- airplanesnew4 %>% group_by(Airline) %>% summarise(Totaldelayed = sum(Delayed), Totalontime = sum(Ontime))
counts
## # A tibble: 2 x 3
##   Airline Totaldelayed Totalontime
##     <chr>        <int>       <int>
## 1  Alaska          501        3274
## 2 AM West          787        6448
graph1 <- mutate(counts, Airline, Totaldelayed, Totalontime, Totalflights = Totaldelayed + Totalontime, percentdelayed = ((Totaldelayed/(Totalontime + Totaldelayed))*100), percentontime = ((Totalontime/(Totalontime + Totaldelayed))*100))
graph1
## # A tibble: 2 x 6
##   Airline Totaldelayed Totalontime Totalflights percentdelayed
##     <chr>        <int>       <int>        <int>          <dbl>
## 1  Alaska          501        3274         3775       13.27152
## 2 AM West          787        6448         7235       10.87768
## # ... with 1 more variables: percentontime <dbl>

Overall, AM West has a nigher number of delayed flights, but when looking at ratios, Alaska has the highest probability of having a flight delayed. AM West has almost double the total number of flights

library(ggplot2)
g <- ggplot(graph1, aes(x = Airline)) + geom_bar(aes(weight = Totaldelayed), fill="#DD8888") + ylab("Delayed") + ggtitle("Total Delayed Flights per Airline")
g

graph2 <- airplanesnew4 %>% group_by(Airline, city) %>% summarise(Totaldelayed = sum(Delayed), Totalontime = sum(Ontime)) %>% 
 mutate(city, Totaldelayed, Totalontime, percentdelayed = ((Totaldelayed/(Totalontime + Totaldelayed))*100), percentontime = ((Totalontime/(Totalontime + Totaldelayed))*100))
graph2
## # A tibble: 10 x 6
## # Groups:   Airline [2]
##    Airline         city Totaldelayed Totalontime percentdelayed
##      <chr>        <chr>        <int>       <int>          <dbl>
##  1  Alaska   LosAngeles           62         497      11.091234
##  2  Alaska      Phoenix           12         221       5.150215
##  3  Alaska     SanDiego           20         212       8.620690
##  4  Alaska SanFrancisco          102         503      16.859504
##  5  Alaska      Seattle          305        1841      14.212488
##  6 AM West   LosAngeles          117         694      14.426634
##  7 AM West      Phoenix          415        4850       7.882241
##  8 AM West     SanDiego           65         383      14.508929
##  9 AM West SanFrancisco          129         320      28.730512
## 10 AM West      Seattle           61         201      23.282443
## # ... with 1 more variables: percentontime <dbl>
g3 <- ggplot(graph2, aes(x = city, y = Totaldelayed)) + geom_bar(aes(fill= Airline), stat = "Identity", position=position_dodge()) + ylab("Delayed") + ggtitle("Total Delayed Flights per Airline")
g3

The highest number of delayed flights occur at Phoenix aiport by AM West Airlines

g1 <- ggplot(graph2, aes(x = city, y = percentdelayed)) + geom_bar(aes(fill= Airline), stat = "Identity", position=position_dodge()) + ylab("On Time") + ggtitle("Percent Delayed Flights per Airline")
g1

The highest probability of flights delayed occur at San Francisco airport by AM West