library(tidyr)
library(dplyr)
library(ggplot2)
library(flextable)
library(rstatix)
# Download the flight counts. quote = "" switches off quote handling, so any
# double quotes in the file are kept as literal characters in the values.
url <- "https://raw.githubusercontent.com/jewelercart/Data607/main/flight_data.csv"
flight_data <- read.csv(url, quote = "")

# Replace the header with fixed names (the file must have exactly 4 columns).
names(flight_data) <- c("Time_Zone", "Cities", "on_time", "delayed")

# Wide -> long: the on_time and delayed columns become (Status, Count) pairs,
# leaving one row per Time_Zone / Cities / Status combination.
flight_data_long <- gather(
  flight_data,
  key = "Status",
  value = "Count",
  on_time, delayed
)
flight_data_long
## Time_Zone Cities Status Count
## 1 "ALASKA" "Los Angeles" on_time 497
## 2 "ALASKA" "Phoenix" on_time 221
## 3 "ALASKA" "San Diego" on_time 212
## 4 "ALASKA" "San Francisco" on_time 503
## 5 "ALASKA" "Seattle" on_time 1841
## 6 "AM WEST" "Los Angeles" on_time 694
## 7 "AM WEST" "Phoenix" on_time 4840
## 8 "AM WEST" "San Diego" on_time 383
## 9 "AM WEST" "San Francisco" on_time 320
## 10 "AM WEST" "Seattle" on_time 201
## 11 "ALASKA" "Los Angeles" delayed 62
## 12 "ALASKA" "Phoenix" delayed 12
## 13 "ALASKA" "San Diego" delayed 20
## 14 "ALASKA" "San Francisco" delayed 102
## 15 "ALASKA" "Seattle" delayed 305
## 16 "AM WEST" "Los Angeles" delayed 117
## 17 "AM WEST" "Phoenix" delayed 415
## 18 "AM WEST" "San Diego" delayed 65
## 19 "AM WEST" "San Francisco" delayed 129
## 20 "AM WEST" "Seattle" delayed 61
# Drop the literal double quotes left in the two label columns by reading the
# file with quote = "".
flight_data_long[c("Time_Zone", "Cities")] <- lapply(
  flight_data_long[c("Time_Zone", "Cities")],
  gsub,
  pattern = "\"",
  replacement = ""
)

# Render the cleaned long-format data as a markdown table.
table <- flight_data_long %>%
  knitr::kable()
table
Time_Zone | Cities | Status | Count |
---|---|---|---|
ALASKA | Los Angeles | on_time | 497 |
ALASKA | Phoenix | on_time | 221 |
ALASKA | San Diego | on_time | 212 |
ALASKA | San Francisco | on_time | 503 |
ALASKA | Seattle | on_time | 1841 |
AM WEST | Los Angeles | on_time | 694 |
AM WEST | Phoenix | on_time | 4840 |
AM WEST | San Diego | on_time | 383 |
AM WEST | San Francisco | on_time | 320 |
AM WEST | Seattle | on_time | 201 |
ALASKA | Los Angeles | delayed | 62 |
ALASKA | Phoenix | delayed | 12 |
ALASKA | San Diego | delayed | 20 |
ALASKA | San Francisco | delayed | 102 |
ALASKA | Seattle | delayed | 305 |
AM WEST | Los Angeles | delayed | 117 |
AM WEST | Phoenix | delayed | 415 |
AM WEST | San Diego | delayed | 65 |
AM WEST | San Francisco | delayed | 129 |
AM WEST | Seattle | delayed | 61 |
# Mean / median / max / min of Count within each Time_Zone x Status group
# (each group holds one row per city), formatted as a flextable.
summary_stats <- flight_data_long %>%
  group_by(Time_Zone, Status) %>%
  get_summary_stats(
    Count,
    show = c("mean", "median", "max", "min")
  ) %>%
  flextable()
# Overall on-time vs. delayed flight totals for each Time_Zone value.
#
# flight_data_long holds one row per city, i.e. five rows for every
# Time_Zone/Status pair. Cities is not mapped to an aesthetic, so dodging the
# raw rows would draw those five bars over each other and the visible height
# would be the largest city count rather than the number of flights. The
# counts are therefore summed per group before plotting.
flight_data_long %>%
  group_by(Time_Zone, Status) %>%
  summarise(Count = sum(Count), .groups = "drop") %>%
  ggplot(aes(x = Time_Zone, y = Count, fill = Status)) +
  geom_col(position = "dodge") +
  labs(
    title = "Comparison of Arrival Delays by Time Zones and Status",
    y = "Number of Flights"
  ) +
  theme_minimal()
# Dodged bar chart of on-time vs. delayed counts per city for one subset of
# flight_data_long.
#   data:        rows to plot (needs Cities, Count and Status columns)
#   group_label: text inserted into the plot title
# Returns the ggplot object. Both charts share one definition so that their
# axis labels and theme settings cannot drift apart.
plot_delays_by_city <- function(data, group_label) {
  ggplot(data, aes(x = Cities, y = Count, fill = Status)) +
    geom_col(position = "dodge") +
    labs(
      title = paste0("Arrival Delays for ", group_label, " by City and Status"),
      y = "Number of Flights"
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      plot.title = element_text(hjust = 0.5),
      panel.grid.minor = element_blank()
    )
}

# Split by Time_Zone value. The comparisons rely on the literal double quotes
# having already been stripped from Time_Zone.
alaska_data <- flight_data_long %>% filter(Time_Zone == "ALASKA")
am_west_data <- flight_data_long %>% filter(Time_Zone == "AM WEST")

plot_alaska <- plot_delays_by_city(alaska_data, "ALASKA")
plot_alaska

plot_am_west <- plot_delays_by_city(am_west_data, "AM WEST")
plot_am_west
Summary Statistics:
# Display the summary_stats flextable (top-level auto-print).
summary_stats
Time_Zone | Status | variable | n | mean | median | max | min |
---|---|---|---|---|---|---|---|
ALASKA | delayed | Count | 5 | 100.2 | 62 | 305 | 12 |
ALASKA | on_time | Count | 5 | 654.8 | 497 | 1,841 | 212 |
AM WEST | delayed | Count | 5 | 157.4 | 117 | 415 | 61 |
AM WEST | on_time | Count | 5 | 1,287.6 | 383 | 4,840 | 201 |
# Contingency table of FLIGHTS by Time_Zone and Status.
# Count on the left-hand side of the formula makes each cell the sum of Count.
# The one-sided form `~ Time_Zone + Status` only counts data rows (5 per
# cell), which makes the independence test trivially Chisq = 0, p-value = 1.
tbl <- xtabs(Count ~ Time_Zone + Status, data = flight_data_long)

# Chi-squared test for independence of Time_Zone and Status.
summary(tbl)
## Values below are derived by hand from the counts listed earlier in this
## document; re-knit to refresh the exact output.
## Cell totals: ALASKA  delayed = 501, on_time = 3274
##              AM WEST delayed = 787, on_time = 6438
## Number of cases in table: 11000
## Chisq is approximately 13.57, df = 1, p-value is approximately 0.0002

# Share of delayed / on-time flights within each Time_Zone value
# (each row sums to 1), so the two groups can be compared directly.
proportions(tbl, "Time_Zone")
##          Status
## Time_Zone delayed on_time
##   ALASKA   ~0.133  ~0.867
##   AM WEST  ~0.109  ~0.891
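A chi-squared test on the unweighted cross-tabulation of rows (`xtabs(~ Time_Zone + Status)`) gives p-value = 1, but that table only counts data rows (five per cell), so it says nothing about flight delays and punctuality. Weighting by `Count` gives 3,274 on-time and 501 delayed flights for ALASKA versus 6,438 on-time and 787 delayed flights for AM WEST, i.e. overall delay rates of about 13.3% and 10.9%. A chi-squared test on those flight counts gives a p-value of roughly 0.0002, so the two groups do differ in punctuality.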