knitr::include_graphics("Assignment Tidying and Transforming Data.pdf")Assignment Tidying and Transforming Data
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA
## 4 AM WEST on time 694 4,840 383 320 201
## 5 delayed 117 415 65 129 61
## Airline FlightStatus Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA
## 4 AM WEST on time 694 4,840 383 320 201
## 5 delayed 117 415 65 129 61
## Airline FlightStatus Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 delayed 62 12 20 102 305
## 4 AM WEST on time 694 4,840 383 320 201
## 5 delayed 117 415 65 129 61
## Airline FlightStatus Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4,840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
pivot.raw.data <- tidyr::gather(raw.data, "City", "Count", 3:7)
pivot.raw.data## Airline FlightStatus City Count
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4,840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AM WEST on time San.Francisco 320
## 16 AM WEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1,841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
pivot.raw.data <- transform(pivot.raw.data, Count = as.numeric(gsub(",", "", Count)))
pivot.raw.data$City <- str_replace(pivot.raw.data$City, "[.]", " ")
pivot.raw.data## Airline FlightStatus City Count
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 AM WEST delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
pivot_raw_data <- tidyr::spread(pivot.raw.data, FlightStatus, Count)
pivot_raw_data## Airline City delayed on time
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San Diego 65 383
## 9 AM WEST San Francisco 129 320
## 10 AM WEST Seattle 61 201
#colnames(pivot_raw_data)DT::datatable(pivot_raw_data, options = list(pagelength=5))DT::datatable(select(pivot_raw_data, Airline:`on time`), options = list(pagelength=5))knitr::kable(pivot_raw_data)| Airline | City | delayed | on time |
|---|---|---|---|
| ALASKA | Los Angeles | 62 | 497 |
| ALASKA | Phoenix | 12 | 221 |
| ALASKA | San Diego | 20 | 212 |
| ALASKA | San Francisco | 102 | 503 |
| ALASKA | Seattle | 305 | 1841 |
| AM WEST | Los Angeles | 117 | 694 |
| AM WEST | Phoenix | 415 | 4840 |
| AM WEST | San Diego | 65 | 383 |
| AM WEST | San Francisco | 129 | 320 |
| AM WEST | Seattle | 61 | 201 |
## Airline City delayed on time
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San Diego 65 383
## 9 AM WEST San Francisco 129 320
## 10 AM WEST Seattle 61 201
OnTimeByAirline <- pivot_raw_data %>%
group_by(Airline) %>%
summarise(totalDelayed = sum(delayed),
totalOnTime = sum(`on time`),
percentDelayed = round(100 * totalDelayed/(totalDelayed+totalOnTime), 2),
percentOnTime = round(100 * totalOnTime/(totalDelayed+totalOnTime), 2)
) %>%
arrange(desc(percentOnTime))
OnTimeByAirline## # A tibble: 2 x 5
## Airline totalDelayed totalOnTime percentDelayed percentOnTime
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 AM WEST 787 6438 10.9 89.1
## 2 ALASKA 501 3274 13.3 86.7
DelaysByCity <- pivot_raw_data %>%
group_by(City) %>%
summarise(totalDelayed=sum(delayed),
totalOnTime=sum(`on time`),
percentDelayed=round(100 * totalDelayed/(totalDelayed+totalOnTime), 2),
percentOnTime = round(100 * totalOnTime/(totalDelayed+totalOnTime), 2)
) %>%
arrange(desc(percentDelayed))
DelaysByCity## # A tibble: 5 x 5
## City totalDelayed totalOnTime percentDelayed percentOnTime
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 San Francisco 231 823 21.9 78.1
## 2 Seattle 366 2042 15.2 84.8
## 3 Los Angeles 179 1191 13.1 86.9
## 4 San Diego 85 595 12.5 87.5
## 5 Phoenix 427 5061 7.78 92.2
AirlineCity <- pivot_raw_data %>%
mutate(totalFlights = delayed + `on time`,
percentDelayed = 100*delayed / totalFlights,
percentOnTime = 100*`on time` / totalFlights
) %>%
arrange(percentDelayed)
AirlineCity## Airline City delayed on time totalFlights percentDelayed
## 1 ALASKA Phoenix 12 221 233 5.150215
## 2 AM WEST Phoenix 415 4840 5255 7.897241
## 3 ALASKA San Diego 20 212 232 8.620690
## 4 ALASKA Los Angeles 62 497 559 11.091234
## 5 ALASKA Seattle 305 1841 2146 14.212488
## 6 AM WEST Los Angeles 117 694 811 14.426634
## 7 AM WEST San Diego 65 383 448 14.508929
## 8 ALASKA San Francisco 102 503 605 16.859504
## 9 AM WEST Seattle 61 201 262 23.282443
## 10 AM WEST San Francisco 129 320 449 28.730512
## percentOnTime
## 1 94.84979
## 2 92.10276
## 3 91.37931
## 4 88.90877
## 5 85.78751
## 6 85.57337
## 7 85.49107
## 8 83.14050
## 9 76.71756
## 10 71.26949
ggplot(AirlineCity, aes(x=reorder(City,percentDelayed), y=percentDelayed)) +
geom_bar(aes(fill = Airline), position = "dodge", stat = "identity") +
xlab("City") +
ylab("Delay Rate")ggplot(AirlineCity, aes(x=City)) +
geom_line(aes(y=percentDelayed, color = "darkred")) +
geom_line(aes(y=percentOnTime, color="steelblue"), linetype="twodash") +
scale_y_continuous(name ="Cumulative minutes delayed by cause of delay", labels = scales::comma ) +
scale_color_discrete(name = "Flight Status", labels=c("Delay", "On Time")) +
theme_minimal()ggplot(pivot.raw.data, aes(x = Airline, y = Count, fill = FlightStatus)) +
geom_bar(position = "stack", stat = "identity") + facet_grid(~ City) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_discrete(labels = c("Delayed", "On Time")) +
ylab("Flights")ggplot(OnTimeByAirline, aes(Airline, percentOnTime)) +
geom_bar(aes(fill = Airline), position = "dodge", stat = "identity") +
ylab("Percent OnTIme")