remove the NA in the 3rd row
data <- data.frame(data[-3, ])
kable(head(data))
| 1 |
ALASKA |
on time |
497 |
221 |
21 |
503 |
1841 |
| 2 |
NA |
delayed |
62 |
12 |
20 |
102 |
305 |
| 4 |
AM WEST |
on time |
694 |
4840 |
383 |
320 |
201 |
| 5 |
NA |
delayed |
117 |
415 |
65 |
129 |
61 |
Give name for rows and columns and replace the rows which still have NA
colnames(data)[1]="Airline"
colnames(data)[2]="Status"
data[2,1]="ALASKA"
data[4,1]="AM WEST"
kable(head(data))
| 1 |
ALASKA |
on time |
497 |
221 |
21 |
503 |
1841 |
| 2 |
ALASKA |
delayed |
62 |
12 |
20 |
102 |
305 |
| 4 |
AM WEST |
on time |
694 |
4840 |
383 |
320 |
201 |
| 5 |
AM WEST |
delayed |
117 |
415 |
65 |
129 |
61 |
Now, we can use “gather” of tidyr to gather the rows and columns
tdata <- gather(data, "Destination", "Number_of_time", 3:7)
kable(head(tdata))
| ALASKA |
on time |
Los.Angeles |
497 |
| ALASKA |
delayed |
Los.Angeles |
62 |
| AM WEST |
on time |
Los.Angeles |
694 |
| AM WEST |
delayed |
Los.Angeles |
117 |
| ALASKA |
on time |
Phonexi |
221 |
| ALASKA |
delayed |
Phonexi |
12 |
spread the “on time” and “delay”
tdata1 <- spread(tdata, key= Status, value = Number_of_time)
colnames(tdata1)[4]="ontime"
kable(head(tdata1))
| ALASKA |
Los.Angeles |
62 |
497 |
| ALASKA |
Phonexi |
12 |
221 |
| ALASKA |
San.Diego |
20 |
21 |
| ALASKA |
San.Francisco |
102 |
503 |
| ALASKA |
Seattle |
305 |
1841 |
| AM WEST |
Los.Angeles |
117 |
694 |
(3) Perform analysis to compare the arrival delays for the two airlines.
Convert delayed and ontime to numerical value so that it is easy to calculate.
tdata1 <- within(tdata1, {delayed <- as.numeric(as.character(delayed))
ontime <-as.numeric(as.character(ontime))})
# Use "mutate" to preserve the old list but add new columns of "total" and "PercentDelayed"
tdata1<-mutate(tdata1, Total= delayed + ontime, PercentDelayed = delayed / Total*100)
tdata1 <- arrange(tdata1, PercentDelayed)
kable(tdata1)
| ALASKA |
Phonexi |
12 |
221 |
233 |
5.150215 |
| AM WEST |
Phonexi |
415 |
4840 |
5255 |
7.897241 |
| ALASKA |
Los.Angeles |
62 |
497 |
559 |
11.091234 |
| ALASKA |
Seattle |
305 |
1841 |
2146 |
14.212488 |
| AM WEST |
Los.Angeles |
117 |
694 |
811 |
14.426634 |
| AM WEST |
San.Diego |
65 |
383 |
448 |
14.508929 |
| ALASKA |
San.Francisco |
102 |
503 |
605 |
16.859504 |
| AM WEST |
Seattle |
61 |
201 |
262 |
23.282443 |
| AM WEST |
San.Francisco |
129 |
320 |
449 |
28.730512 |
| ALASKA |
San.Diego |
20 |
21 |
41 |
48.780488 |
group and find the average delayed percent by Airlines
delays <- tdata1 %>% group_by(Airline) %>% summarise(MeanPercent= round(mean(PercentDelayed, 0)))
delays
## # A tibble: 2 × 2
## Airline MeanPercent
## <chr> <dbl>
## 1 ALASKA 19
## 2 AM WEST 18
Alaska is delayed 19% and AM WEST is 18% of the time.