#Above code will be hidden for password privacy purpose
#alternative way to read from csv file
#url = 'https://raw.githubusercontent.com/mehreengillani/DATA607/refs/heads/main/flight_info.csv'
#flights <- read.csv(url)
#head(flights)
flights <- dbReadTable(con, "flight_delays")
flights
## id airline airport ontime delay
## 1 1 ALASKA Los.Angeles 497 62
## 2 2 ALASKA Phoenix 221 12
## 3 3 ALASKA San.Diego 212 20
## 4 4 ALASKA San.Francisco 503 102
## 5 5 ALASKA Seattle 1841 305
## 6 6 AM_WEST Los.Angeles 694 117
## 7 7 AM_WEST Phoenix 4840 415
## 8 8 AM_WEST San.Diego 383 65
## 9 9 AM_WEST San.Francisco 320 129
## 10 10 AM_WEST Seattle 201 61
# Check for missing values in the dataset
colSums(is.na(flights)) # number of NAs per column
## id airline airport ontime delay
## 0 0 0 0 0
# Summary for numeric columns
summary(flights[sapply(flights, is.numeric)])
## id ontime delay
## Min. : 1.00 Min. : 201.0 Min. : 12.00
## 1st Qu.: 3.25 1st Qu.: 245.8 1st Qu.: 61.25
## Median : 5.50 Median : 440.0 Median : 83.50
## Mean : 5.50 Mean : 971.2 Mean :128.80
## 3rd Qu.: 7.75 3rd Qu.: 646.2 3rd Qu.:126.00
## Max. :10.00 Max. :4840.0 Max. :415.00
#print unique values for airlines
unique(flights$airline)
## [1] "ALASKA" "AM_WEST"
#printunique values for airports
unique(flights$airport)
## [1] "Los.Angeles" "Phoenix" "San.Diego" "San.Francisco"
## [5] "Seattle"
There are no null values. There are two airlines and five destination airports. Max number of delayed flights are 415 with average of 128. Max number of on time flights are 4840 with mean value of 971.
flights_long <- flights %>%
pivot_longer(cols = c(ontime, delay),
names_to = "status",
values_to = "count")
flights_long
## # A tibble: 20 × 5
## id airline airport status count
## <int> <chr> <chr> <chr> <int>
## 1 1 ALASKA Los.Angeles ontime 497
## 2 1 ALASKA Los.Angeles delay 62
## 3 2 ALASKA Phoenix ontime 221
## 4 2 ALASKA Phoenix delay 12
## 5 3 ALASKA San.Diego ontime 212
## 6 3 ALASKA San.Diego delay 20
## 7 4 ALASKA San.Francisco ontime 503
## 8 4 ALASKA San.Francisco delay 102
## 9 5 ALASKA Seattle ontime 1841
## 10 5 ALASKA Seattle delay 305
## 11 6 AM_WEST Los.Angeles ontime 694
## 12 6 AM_WEST Los.Angeles delay 117
## 13 7 AM_WEST Phoenix ontime 4840
## 14 7 AM_WEST Phoenix delay 415
## 15 8 AM_WEST San.Diego ontime 383
## 16 8 AM_WEST San.Diego delay 65
## 17 9 AM_WEST San.Francisco ontime 320
## 18 9 AM_WEST San.Francisco delay 129
## 19 10 AM_WEST Seattle ontime 201
## 20 10 AM_WEST Seattle delay 61
airline_delay_summary <- flights %>%
group_by(airline) %>%
summarise(
total_flights = sum(ontime + delay),
total_delays = sum(delay),
delay_rate = total_delays / total_flights,
on_time_rate = sum(ontime) / total_flights
) %>%
arrange(desc(delay_rate))
#arrival delay rates per airline
airline_delay_summary
## # A tibble: 2 × 5
## airline total_flights total_delays delay_rate on_time_rate
## <chr> <int> <int> <dbl> <dbl>
## 1 ALASKA 3775 501 0.133 0.867
## 2 AM_WEST 7225 787 0.109 0.891
library(ggplot2)
# Convert to long format for plotting
airline_long <- airline_delay_summary %>%
pivot_longer(cols = c(delay_rate, on_time_rate),
names_to = "status",
values_to = "rate")
# Plot side-by-side bars (with % in title and axis label)
ggplot(airline_long, aes(x = airline, y = rate, fill = status)) +
geom_col(position = position_dodge(width = 0.6), width = 0.6) +
geom_text(aes(label = paste0(round(rate*100, 1), "%")),
position = position_dodge(width = 0.6),
vjust = -0.5, size = 3) +
labs(title = "Arrival Delay vs On-time Percentage by Airline",
x = "Airline",
y = "Percentage") + # changed here
scale_y_continuous(labels = scales::percent_format(), limits = c(0, 1.1)) +
scale_fill_manual(values = c("delay_rate" = "darkblue", "on_time_rate" = "skyblue"),
labels = c("Delay", "On-time")) +
theme_minimal()
# Calculate total flights, delays, and rates per destination airport
airport_summary <- flights %>%
group_by(airport) %>%
summarise(
total_flights = sum(ontime + delay),
total_delays = sum(delay),
total_ontime = sum(ontime),
delay_rate = total_delays / total_flights,
on_time_rate = total_ontime / total_flights
) %>%
arrange(desc(delay_rate))
airport_summary
## # A tibble: 5 × 6
## airport total_flights total_delays total_ontime delay_rate on_time_rate
## <chr> <int> <int> <int> <dbl> <dbl>
## 1 San.Francisco 1054 231 823 0.219 0.781
## 2 Seattle 2408 366 2042 0.152 0.848
## 3 Los.Angeles 1370 179 1191 0.131 0.869
## 4 San.Diego 680 85 595 0.125 0.875
## 5 Phoenix 5488 427 5061 0.0778 0.922
# Convert to long format for plotting
airport_long <- airport_summary %>%
pivot_longer(cols = c(delay_rate, on_time_rate),
names_to = "status",
values_to = "rate")
ggplot(airport_long, aes(x = airport, y = rate, fill = status)) +
geom_col(position = position_dodge(width = 0.6), width = 0.6) +
geom_text(aes(label = paste0(round(rate*100, 1), "%")),
position = position_dodge(width = 0.6),
vjust = -0.5, size = 3) +
labs(title = "Arrival Delay vs On-time flights percentage by Destination Airport",
x = "Destination Airport",
y = "Percentage") +
scale_y_continuous(labels = percent_format(), limits = c(0, 1.1)) +
scale_fill_manual(values = c("delay_rate" = "black", "on_time_rate" = "grey"),
labels = c("Delay", "On-time")) +
theme_minimal()
airline_airport_summary <- flights %>%
group_by(airline, airport) %>%
summarise(
total_flights = sum(ontime + delay),
total_delays = sum(delay),
total_ontime = sum(ontime),
delay_rate = total_delays / total_flights,
on_time_rate = total_ontime / total_flights
) %>%
ungroup()
## `summarise()` has grouped output by 'airline'. You can override using the
## `.groups` argument.
# Reshape for plotting
airline_airport_long <- airline_airport_summary %>%
pivot_longer(cols = c(delay_rate, on_time_rate),
names_to = "status",
values_to = "rate")
# Plot grouped bars
ggplot(airline_airport_long, aes(x = airport, y = rate, fill = status)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_text(aes(label = paste0(round(rate*100, 1), "%")),
position = position_dodge(width = 0.8),
hjust = -0.1, size = 3) + # hjust places text in line horizontally
facet_wrap(~airline, ncol = 1) +
labs(title = "Delay vs On-time flights percentage by Airport and Airline",
x = "Destination Airport",
y = "percentage") +
scale_y_continuous(labels = percent_format(), limits = c(0, 1.1)) +
scale_fill_manual(values = c("delay_rate" = "black", "on_time_rate" = "grey"),
labels = c("Delay", "On-time")) +
coord_flip() + # flip axes to make horizontal bars
theme_minimal()
The total number of flights for AM_WEST is more than double that of ALASKA.
ALASKA has a higher overall delay percentage than AM_WEST by approximately 2.3%.
Phoenix handles the highest number of flights, followed by Seattle in second place.
San Francisco has the highest delay rate among destinations, with Seattle following.
Phoenix has the lowest flight delay rate across all destinations.
The ALASKA flights to Phoenix have the lowest individual delay rate, followed closely by AM_WEST flights to Phoenix.
AM_WEST flights to San Francisco experience the highest delay rate, followed by ALASKA flights to San Francisco.