Data Cleaning to make it into a condition ready for analysis
# rename cols
names(airlines)[c(1,2)] <- c("Airline", "Status")
airlines
Airline Status LosAngeles Phoenix SanDiego SanFrancisco Seattle
1 ALASKA on time 497 221 212 503 1841
2 delayed 62 12 20 102 305
3 NA NA NA NA NA
4 AMWEST on time 694 4840 383 320 201
5 delayed 117 415 65 129 61
6 NA NA NA NA NA
# remove middle NA obs, in row3, and row6
airlines <- airlines [c(-3,-6),]
airlines
Airline Status LosAngeles Phoenix SanDiego SanFrancisco Seattle
1 ALASKA on time 497 221 212 503 1841
2 delayed 62 12 20 102 305
4 AMWEST on time 694 4840 383 320 201
5 delayed 117 415 65 129 61
# add in missing airline values
airlines[2,1] <- "ALASKA"
airlines[4,1] <- "AMWEST"
kable (airlines)
1 |
ALASKA |
on time |
497 |
221 |
212 |
503 |
1841 |
2 |
ALASKA |
delayed |
62 |
12 |
20 |
102 |
305 |
4 |
AMWEST |
on time |
694 |
4840 |
383 |
320 |
201 |
5 |
AMWEST |
delayed |
117 |
415 |
65 |
129 |
61 |
# the original data is in wide format, make it a long format for cities
tidyairline1 <- airlines %>%
gather(cities, count, 3:7)
tidyairline1
Airline Status cities count
1 ALASKA on time LosAngeles 497
2 ALASKA delayed LosAngeles 62
3 AMWEST on time LosAngeles 694
4 AMWEST delayed LosAngeles 117
5 ALASKA on time Phoenix 221
6 ALASKA delayed Phoenix 12
7 AMWEST on time Phoenix 4840
8 AMWEST delayed Phoenix 415
9 ALASKA on time SanDiego 212
10 ALASKA delayed SanDiego 20
11 AMWEST on time SanDiego 383
12 AMWEST delayed SanDiego 65
13 ALASKA on time SanFrancisco 503
14 ALASKA delayed SanFrancisco 102
15 AMWEST on time SanFrancisco 320
16 AMWEST delayed SanFrancisco 129
17 ALASKA on time Seattle 1841
18 ALASKA delayed Seattle 305
19 AMWEST on time Seattle 201
20 AMWEST delayed Seattle 61
# then, for the status (on time vs delay, we transform it a wide format)
tidyairline2 <- tidyairline1 %>%
spread(Status, count)
tidyairline2
Airline cities delayed on time
1 ALASKA LosAngeles 62 497
2 ALASKA Phoenix 12 221
3 ALASKA SanDiego 20 212
4 ALASKA SanFrancisco 102 503
5 ALASKA Seattle 305 1841
6 AMWEST LosAngeles 117 694
7 AMWEST Phoenix 415 4840
8 AMWEST SanDiego 65 383
9 AMWEST SanFrancisco 129 320
10 AMWEST Seattle 61 201
#rename the 4th column name to get rid of the space in col name
names(tidyairline2)[4] <- "ontime"
kable(tidyairline2)
ALASKA |
LosAngeles |
62 |
497 |
ALASKA |
Phoenix |
12 |
221 |
ALASKA |
SanDiego |
20 |
212 |
ALASKA |
SanFrancisco |
102 |
503 |
ALASKA |
Seattle |
305 |
1841 |
AMWEST |
LosAngeles |
117 |
694 |
AMWEST |
Phoenix |
415 |
4840 |
AMWEST |
SanDiego |
65 |
383 |
AMWEST |
SanFrancisco |
129 |
320 |
AMWEST |
Seattle |
61 |
201 |
(3) Perfor |
m analysis to c |
ompare the |
arrival delays for the two airlines. |
#Delay ratios by airline and city, looked at each element seperately
tidyairline3 <- tidyairline2 %>%
mutate(total=ontime + delayed, delayratio=round(delayed/total, 2))
## Warning: package 'bindrcpp' was built under R version 3.3.3
tidyairline3
## Airline cities delayed ontime total delayratio
## 1 ALASKA LosAngeles 62 497 559 0.11
## 2 ALASKA Phoenix 12 221 233 0.05
## 3 ALASKA SanDiego 20 212 232 0.09
## 4 ALASKA SanFrancisco 102 503 605 0.17
## 5 ALASKA Seattle 305 1841 2146 0.14
## 6 AMWEST LosAngeles 117 694 811 0.14
## 7 AMWEST Phoenix 415 4840 5255 0.08
## 8 AMWEST SanDiego 65 383 448 0.15
## 9 AMWEST SanFrancisco 129 320 449 0.29
## 10 AMWEST Seattle 61 201 262 0.23
#Plot these delay ratios, based on City
dotchart(tidyairline3$delayratio, groups = tidyairline3$Airline, labels = tidyairline3$cities )
