library(tidyr)
library(dplyr)
library(ggplot2)
library(flextable)
library(rstatix)
# Load the arrival-status counts and reshape them to one row per
# Time_Zone / Cities / Status combination.
url <- "https://raw.githubusercontent.com/jewelercart/Data607/main/flight_data.csv"
# quote = "" turns off quote processing, so quoted text fields keep their
# literal double quotes (stripped further down); fill = TRUE pads short rows.
flight_data <- read.csv(url, header = TRUE, sep = ",", quote = "", fill = TRUE)
colnames(flight_data) <- c("Time_Zone", "Cities", "on_time", "delayed")
# Wide -> long: the on_time/delayed columns become a Status key and a Count
# value. pivot_longer() replaces the superseded gather(); cols_vary = "slowest"
# keeps gather()'s row order (all on_time rows first, then all delayed rows).
flight_data_long <- flight_data %>%
  pivot_longer(
    cols = -c(Time_Zone, Cities),
    names_to = "Status",
    values_to = "Count",
    cols_vary = "slowest"
  )
# Remove the literal double quotes left in the text columns by quote = "".
flight_data_long$Time_Zone <- gsub("\"", "", flight_data_long$Time_Zone)
flight_data_long$Cities <- gsub("\"", "", flight_data_long$Cities)
# Render the long data as a table; the bare name on the last line prints it.
table <- knitr::kable(flight_data_long)
table
Time_Zone | Cities | Status | Count |
---|---|---|---|
ALASKA | Los Angeles | on_time | 497 |
ALASKA | Phoenix | on_time | 221 |
ALASKA | San Diego | on_time | 212 |
ALASKA | San Francisco | on_time | 503 |
ALASKA | Seattle | on_time | 1841 |
AM WEST | Los Angeles | on_time | 694 |
AM WEST | Phoenix | on_time | 4840 |
AM WEST | San Diego | on_time | 383 |
AM WEST | San Francisco | on_time | 320 |
AM WEST | Seattle | on_time | 201 |
ALASKA | Los Angeles | delayed | 62 |
ALASKA | Phoenix | delayed | 12 |
ALASKA | San Diego | delayed | 20 |
ALASKA | San Francisco | delayed | 102 |
ALASKA | Seattle | delayed | 305 |
AM WEST | Los Angeles | delayed | 117 |
AM WEST | Phoenix | delayed | 415 |
AM WEST | San Diego | delayed | 65 |
AM WEST | San Francisco | delayed | 129 |
AM WEST | Seattle | delayed | 61 |
# Per Time_Zone/Status summary of the city-level Count values
# (mean, median, max and min across the cities in each group).
summary_stats <- flight_data_long %>%
  group_by(Time_Zone, Status) %>%
  get_summary_stats(
    Count,
    show = c("mean", "median", "max","min")
  )
# Overall comparison of flight counts by Time_Zone and Status.
# flight_data_long holds one row per city, and Cities is not mapped to any
# aesthetic here, so plotting the rows directly with position = "dodge" draws
# the city bars of each Time_Zone/Status group on top of one another and only
# the tallest one is visible. Sum Count per group first so each bar shows the
# total number of flights.
flight_data_long %>%
  group_by(Time_Zone, Status) %>%
  summarise(Count = sum(Count), .groups = "drop") %>%
  ggplot(aes(x = Time_Zone, y = Count, fill = Status)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Comparison of Arrival Delays by Time Zones and Status",
    y = "Number of Flights"
  ) +
  theme_minimal()
# Split the long data into one subset per Time_Zone value for the
# per-city plots.
alaska_data <- filter(flight_data_long, Time_Zone == "ALASKA")
am_west_data <- filter(flight_data_long, Time_Zone == "AM WEST")
# Dodged bar chart of Count per city for the ALASKA subset, one bar per
# Status; x labels are tilted 45 degrees and the title is centred.
plot_alaska <- ggplot(
  data = alaska_data,
  mapping = aes(x = Cities, y = Count, fill = Status)
) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Arrival Delays for ALASKA by City and Status",
    y = "Number of Flights"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5),
    panel.grid.minor = element_blank()
  )
# Bare name: display the plot.
plot_alaska
# Dodged bar chart of Count per city for the AM WEST subset, one bar per
# Status; x labels are tilted 45 degrees and the title is centred.
# The y-axis label is "Number of Flights" so it matches the ALASKA plot and
# the overall comparison plot (it previously read "Count").
plot_am_west <- ggplot(am_west_data, aes(x = Cities, y = Count, fill = Status)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Arrival Delays for AM WEST by City and Status",
    y = "Number of Flights"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5),
    panel.grid.minor = element_blank()
  )
# Bare name: display the plot.
plot_am_west
Summary Statistics:
# Display the per Time_Zone/Status summary (mean, median, max, min of the
# city-level Count values) computed earlier.
summary_stats
## # A tibble: 4 × 8
## Time_Zone Status variable n mean median max min
## <chr> <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA delayed Count 5 100. 62 305 12
## 2 ALASKA on_time Count 5 655. 497 1841 212
## 3 AM WEST delayed Count 5 157. 117 415 61
## 4 AM WEST on_time Count 5 1288. 383 4840 201
# Cross-tabulate FLIGHTS, not rows: weight every row by its Count.
# The unweighted formula `~ Time_Zone + Status` only counts data rows, and
# each Time_Zone/Status cell has exactly 5 rows (one per city), so all cells
# are equal and the test is forced to Chisq = 0, p-value = 1 regardless of
# the actual delay counts.
tbl <- xtabs(Count ~ Time_Zone + Status, data = flight_data_long)
# Chi-squared test of independence between Time_Zone and Status.
summary(tbl)
## Cell totals implied by the data table printed earlier in this document:
##          Status
## Time_Zone delayed on_time
##   ALASKA      501    3274
##   AM WEST     787    6438
## Test for independence (hand-computed, approximate; re-run to regenerate):
## Chisq ~ 13.6, df = 1, p-value ~ 0.0002
# Share of each Status column contributed by each Time_Zone
# (each column sums to 1).
proportions(tbl, "Status")
## Approximate values (re-run to regenerate):
##          Status
## Time_Zone delayed on_time
##   ALASKA    0.389   0.337
##   AM WEST   0.611   0.663
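A chi-squared test on the unweighted table of rows gives p-value = 1, but this does not confirm that the two time zones have the same delay and punctuality rates: that table counts data rows rather than flights, and every Time_Zone/Status cell contains exactly 5 rows (one per city), so the statistic is 0 by construction. Weighted by the number of flights, ALASKA has 501 delayed out of 3,775 flights (13.3%) and AM WEST has 787 delayed out of 7,225 flights (10.9%), so any conclusion about a difference should be based on a test of the flight counts, e.g. `xtabs(Count ~ Time_Zone + Status, data = flight_data_long)`.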