source: Numbersense, Kaiser Fung, McGraw HIll, 2013
# read csv into R
raw_data <- read.csv("airline_delays.csv", sep = ",")#, check.names = FALSE)
raw_data
## X X.1 los.angeles phoenix san.diego san.francisco seattle
## 1 alaska on time 497 221 212 503 1841
## 2 alaska delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 am west on time 694 4840 383 320 201
## 5 am west delayed 117 415 65 129 61
# modify column names for easier reference and drop row with blank values
headers.raw_data <- raw_data %>%
rename(airline = 1, status = 2) %>%
drop_na()
headers.raw_data
## airline status los.angeles phoenix san.diego san.francisco seattle
## 1 alaska on time 497 221 212 503 1841
## 2 alaska delayed 62 12 20 102 305
## 4 am west on time 694 4840 383 320 201
## 5 am west delayed 117 415 65 129 61
# melt city and count
molten.raw_data <- headers.raw_data %>%
pivot_longer(-c(airline, status), names_to = "city", values_to = "count")
head(molten.raw_data)
## # A tibble: 6 x 4
## airline status city count
## <fct> <fct> <chr> <int>
## 1 alaska on time los.angeles 497
## 2 alaska on time phoenix 221
## 3 alaska on time san.diego 212
## 4 alaska on time san.francisco 503
## 5 alaska on time seattle 1841
## 6 alaska delayed los.angeles 62
# cast status and count
casted.raw_data <- molten.raw_data %>%
pivot_wider(names_from = "status", values_from = "count")
head(casted.raw_data)
## # A tibble: 6 x 4
## airline city `on time` delayed
## <fct> <chr> <int> <int>
## 1 alaska los.angeles 497 62
## 2 alaska phoenix 221 12
## 3 alaska san.diego 212 20
## 4 alaska san.francisco 503 102
## 5 alaska seattle 1841 305
## 6 am west los.angeles 694 117
# replace period with spaces for values in column "city"
casted.raw_data$city <- str_replace_all(casted.raw_data$city, "\\.", " ")
head(casted.raw_data)
## # A tibble: 6 x 4
## airline city `on time` delayed
## <fct> <chr> <int> <int>
## 1 alaska los angeles 497 62
## 2 alaska phoenix 221 12
## 3 alaska san diego 212 20
## 4 alaska san francisco 503 102
## 5 alaska seattle 1841 305
## 6 am west los angeles 694 117
# new column for total flights
casted.raw_data$flights <- casted.raw_data$`on time` + casted.raw_data$delayed
# new column for percentage of flights that are delayed
casted.raw_data$delayed_rate <- casted.raw_data$delayed / casted.raw_data$flights
casted.raw_data
## # A tibble: 10 x 6
## airline city `on time` delayed flights delayed_rate
## <fct> <chr> <int> <int> <int> <dbl>
## 1 alaska los angeles 497 62 559 0.111
## 2 alaska phoenix 221 12 233 0.0515
## 3 alaska san diego 212 20 232 0.0862
## 4 alaska san francisco 503 102 605 0.169
## 5 alaska seattle 1841 305 2146 0.142
## 6 am west los angeles 694 117 811 0.144
## 7 am west phoenix 4840 415 5255 0.0790
## 8 am west san diego 383 65 448 0.145
## 9 am west san francisco 320 129 449 0.287
## 10 am west seattle 201 61 262 0.233
# calculate average rate of delay for all "alaska" flights
delayed_rate_alaska <- sum(subset(casted.raw_data, airline == "alaska")$delayed_rate) / sum(casted.raw_data$airline == "alaska")
# calculate average rate of delay for all "am west" flights
delayed_rate_am_west <- sum(subset(casted.raw_data, airline == "am west")$delayed_rate) / sum(casted.raw_data$airline == "alaska")
# dataframe storing average rate of delay for each airline
delays <- data.frame(c("alaska", "am west"), c(delayed_rate_alaska, delayed_rate_am_west))
colnames(delays) <- c("airline", "delay")
delays
## airline delay
## 1 alaska 0.1118683
## 2 am west 0.1776915
# plot delayed rate per airline
ggplot(casted.raw_data, aes(x = city, y = delayed_rate, fill = city)) +
geom_bar(stat = 'identity', position = position_dodge()) + # get bar graph of count
facet_grid(~airline) + # group by airline
theme(axis.text.x = element_text(angle = 90)) + # rotate x-axis labels
labs(y="count", title="rate of flight delays by airline and city", subtitle="alaska vs. am west") # labels
# plot delays stacked by flights per airline
ggplot(casted.raw_data, aes(x = city, y = delayed, fill = flights)) +
geom_bar(stat = 'identity', position = position_stack()) + # get bar graph of count
facet_grid(~airline) + # group by airline
theme(axis.text.x = element_text(angle = 90)) + # rotate x-axis labels
labs(y="delays", title="count of flight delays by airline and city", subtitle="alaska vs. am west") # labels