Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
# Tidyverse packages includes ggplot2, dplyr, tidyr already
library(tidyverse)
table <- read.csv("https://raw.githubusercontent.com/tonyCUNY/tonyCUNY/main/assignment5.csv")
table
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
# Remove row with NA value
# Rename X and X.1 to Airline and Status
table_1 <- table |>
slice(-3) |>
rename(Airline = X, Status = X.1)
# Fill in the missing values under Airline column
table_1[2, 1] <- 'ALASKA'
table_1[4, 1] <- 'AM WEST'
table_1
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
# Use pivot_longer() to transform all location into Destination column
# arrange by Destination column
table_2 <- table_1 |>
pivot_longer(
cols = c("Los.Angeles", "Phoenix", "San.Diego", "San.Francisco", "Seattle"),
names_to = "Destination",
values_to = "Count"
) |>
arrange(Destination)
table_2
## # A tibble: 20 × 4
## Airline Status Destination Count
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AM WEST on time San.Francisco 320
## 16 AM WEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
# Use pivot_wider() to transform values under Status into two columns: On_time, Delayed
# value will be from Count column
# rename the new column
table_3 <- table_2 |>
pivot_wider(
names_from = Status,
values_from = Count,
) |>
rename(On_time = 'on time', Delayed = delayed)
table_3
## # A tibble: 10 × 4
## Airline Destination On_time Delayed
## <chr> <chr> <int> <int>
## 1 ALASKA Los.Angeles 497 62
## 2 AM WEST Los.Angeles 694 117
## 3 ALASKA Phoenix 221 12
## 4 AM WEST Phoenix 4840 415
## 5 ALASKA San.Diego 212 20
## 6 AM WEST San.Diego 383 65
## 7 ALASKA San.Francisco 503 102
## 8 AM WEST San.Francisco 320 129
## 9 ALASKA Seattle 1841 305
## 10 AM WEST Seattle 201 61
# Create a column: Delay_rate
table_4 <- table_3 |>
mutate(Delay_rate = round((Delayed / (On_time + Delayed)) * 100, 2))
table_4
## # A tibble: 10 × 5
## Airline Destination On_time Delayed Delay_rate
## <chr> <chr> <int> <int> <dbl>
## 1 ALASKA Los.Angeles 497 62 11.1
## 2 AM WEST Los.Angeles 694 117 14.4
## 3 ALASKA Phoenix 221 12 5.15
## 4 AM WEST Phoenix 4840 415 7.9
## 5 ALASKA San.Diego 212 20 8.62
## 6 AM WEST San.Diego 383 65 14.5
## 7 ALASKA San.Francisco 503 102 16.9
## 8 AM WEST San.Francisco 320 129 28.7
## 9 ALASKA Seattle 1841 305 14.2
## 10 AM WEST Seattle 201 61 23.3
# Create a table showing two airlines Delay_rate for each Destination
table_5 <- table_4 |>
group_by(Destination, Airline) |>
summarize(Delay_rate) |>
spread(Airline, Delay_rate) |>
rename(AM_WEST = 'AM WEST')
## `summarise()` has grouped output by 'Destination'. You can override using the
## `.groups` argument.
table_5
## # A tibble: 5 × 3
## # Groups: Destination [5]
## Destination ALASKA AM_WEST
## <chr> <dbl> <dbl>
## 1 Los.Angeles 11.1 14.4
## 2 Phoenix 5.15 7.9
## 3 San.Diego 8.62 14.5
## 4 San.Francisco 16.9 28.7
## 5 Seattle 14.2 23.3
Conclusion: AM WEST has higher delay_rate than ALASKA
# Visualize the results
table_4 |>
group_by(Airline) |>
ggplot(aes(x = Destination, y = Delay_rate, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Airline Arrival Delay",
x = "Destination",
y = "Delayrate (%)")