Objective

  1. Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
  2. Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.

load needed package

library ('tidyr')
library ('dplyr')
library ('knitr')

Get the data from the CSV File, the CSV file format is exactly the same as the source

airlines <- read.csv ("E:/airlineData.csv")
kable(airlines)
X X.1 LosAngeles Phoenix SanDiego SanFrancisco Seattle
ALASKA on time 497 221 212 503 1841
delayed 62 12 20 102 305
NA NA NA NA NA
AMWEST on time 694 4840 383 320 201
delayed 117 415 65 129 61
NA NA NA NA NA
kable(airlines)
X X.1 LosAngeles Phoenix SanDiego SanFrancisco Seattle
ALASKA on time 497 221 212 503 1841
delayed 62 12 20 102 305
NA NA NA NA NA
AMWEST on time 694 4840 383 320 201
delayed 117 415 65 129 61
NA NA NA NA NA

Data Cleaning to make it into a condition ready for analysis

# rename cols
names(airlines)[c(1,2)] <- c("Airline", "Status")
airlines
  Airline  Status LosAngeles Phoenix SanDiego SanFrancisco Seattle
1  ALASKA on time        497     221      212          503    1841
2         delayed         62      12       20          102     305
3                         NA      NA       NA           NA      NA
4  AMWEST on time        694    4840      383          320     201
5         delayed        117     415       65          129      61
6                         NA      NA       NA           NA      NA
# remove middle NA obs, in row3, and row6
airlines <- airlines [c(-3,-6),]
airlines
  Airline  Status LosAngeles Phoenix SanDiego SanFrancisco Seattle
1  ALASKA on time        497     221      212          503    1841
2         delayed         62      12       20          102     305
4  AMWEST on time        694    4840      383          320     201
5         delayed        117     415       65          129      61
# add in missing airline values
airlines[2,1] <- "ALASKA"
airlines[4,1] <- "AMWEST"
kable (airlines)
Airline Status LosAngeles Phoenix SanDiego SanFrancisco Seattle
1 ALASKA on time 497 221 212 503 1841
2 ALASKA delayed 62 12 20 102 305
4 AMWEST on time 694 4840 383 320 201
5 AMWEST delayed 117 415 65 129 61
# the original data is in wide format, make it a long format for cities
tidyairline1 <- airlines %>% 
   gather(cities, count, 3:7)
tidyairline1
   Airline  Status       cities count
1   ALASKA on time   LosAngeles   497
2   ALASKA delayed   LosAngeles    62
3   AMWEST on time   LosAngeles   694
4   AMWEST delayed   LosAngeles   117
5   ALASKA on time      Phoenix   221
6   ALASKA delayed      Phoenix    12
7   AMWEST on time      Phoenix  4840
8   AMWEST delayed      Phoenix   415
9   ALASKA on time     SanDiego   212
10  ALASKA delayed     SanDiego    20
11  AMWEST on time     SanDiego   383
12  AMWEST delayed     SanDiego    65
13  ALASKA on time SanFrancisco   503
14  ALASKA delayed SanFrancisco   102
15  AMWEST on time SanFrancisco   320
16  AMWEST delayed SanFrancisco   129
17  ALASKA on time      Seattle  1841
18  ALASKA delayed      Seattle   305
19  AMWEST on time      Seattle   201
20  AMWEST delayed      Seattle    61
#  then, for the status (on time vs delay, we transform it a wide format)
tidyairline2 <- tidyairline1 %>% 
     spread(Status, count)
 tidyairline2
   Airline       cities delayed on time
1   ALASKA   LosAngeles      62     497
2   ALASKA      Phoenix      12     221
3   ALASKA     SanDiego      20     212
4   ALASKA SanFrancisco     102     503
5   ALASKA      Seattle     305    1841
6   AMWEST   LosAngeles     117     694
7   AMWEST      Phoenix     415    4840
8   AMWEST     SanDiego      65     383
9   AMWEST SanFrancisco     129     320
10  AMWEST      Seattle      61     201
#rename the 4th column name to get rid of the space in col name
names(tidyairline2)[4] <- "ontime"

kable(tidyairline2)
Airline cities delayed ontime
ALASKA LosAngeles 62 497
ALASKA Phoenix 12 221
ALASKA SanDiego 20 212
ALASKA SanFrancisco 102 503
ALASKA Seattle 305 1841
AMWEST LosAngeles 117 694
AMWEST Phoenix 415 4840
AMWEST SanDiego 65 383
AMWEST SanFrancisco 129 320
AMWEST Seattle 61 201
(3) Perfor m analysis to c ompare the arrival delays for the two airlines.
#Delay ratios by airline and city, looked at each element seperately

tidyairline3 <- tidyairline2 %>%
   mutate(total=ontime + delayed, delayratio=round(delayed/total, 2)) 
## Warning: package 'bindrcpp' was built under R version 3.3.3
tidyairline3
##    Airline       cities delayed ontime total delayratio
## 1   ALASKA   LosAngeles      62    497   559       0.11
## 2   ALASKA      Phoenix      12    221   233       0.05
## 3   ALASKA     SanDiego      20    212   232       0.09
## 4   ALASKA SanFrancisco     102    503   605       0.17
## 5   ALASKA      Seattle     305   1841  2146       0.14
## 6   AMWEST   LosAngeles     117    694   811       0.14
## 7   AMWEST      Phoenix     415   4840  5255       0.08
## 8   AMWEST     SanDiego      65    383   448       0.15
## 9   AMWEST SanFrancisco     129    320   449       0.29
## 10  AMWEST      Seattle      61    201   262       0.23
#Plot these delay ratios, based on City
dotchart(tidyairline3$delayratio, groups = tidyairline3$Airline, labels = tidyairline3$cities )

Observation from the analysis:

It looks like ALASKA airlines had a lower individual delay ratio than AMWEST in every single city.