# Path to data (from Python)
data_path <- "C:/Users/RUKN AL BAB/Desktop/ST2195_Part2/data"
years <- c(2004, 2005, 2006, 2007, 2008)
# Load all years (filter non-diverted for delays)
delays_data <- list()
for (y in years) {
df <- read_parquet(file.path(data_path, paste0("cleaned_full_", y, ".parquet")))
df$TotalDelay <- df$DepDelay + df$ArrDelay # Create total delay
df$DepHour <- floor(df$CRSDepTime / 100) # Dep hour
# Filter non-diverted (delays defined)
delays_data[[as.character(y)]] <- df %>% filter(Diverted == 0)
}
# (a) Best times/days to minimize delays
best_times <- list()
for (y in years) {
df_y <- delays_data[[as.character(y)]]
delays <- df_y %>%
group_by(DayOfWeek, DepHour) %>%
summarise(AvgDelay = mean(TotalDelay, na.rm = TRUE), .groups = 'drop')
min_delay <- delays[which.min(delays$AvgDelay), ]
day_map <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
best_times[[y]] <- list(Day = day_map[min_delay$DayOfWeek], Hour = min_delay$DepHour, AvgDelay = round(min_delay$AvgDelay, 2))
# Heatmap
p <- ggplot(delays, aes(x = factor(DayOfWeek, labels = day_map), y = DepHour, fill = AvgDelay)) +
geom_tile() + scale_fill_gradient(low = "blue", high = "yellow") +
labs(title = paste("Avg Delay Heatmap -", y), x = "Day of Week", y = "Dep Hour")
ggsave(file.path(data_path, paste0("delay_heatmap_r_", y, ".png")), p)
print(p) # Show in report
}
## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

# Best times table
best_df <- data.frame(Year = years, do.call(rbind, best_times))
print(best_df)
## Year Day Hour AvgDelay
## 1 2004 Sun 4 -17.22
## 2 2005 Wed 3 -6.5
## 3 2006 Thu 2 -5.8
## 4 2007 Wed 4 -0.44
## 5 2008 Sun 3 -12.25
# (b) Older planes and delays
delays_age_all <- data.frame()
corrs <- numeric(length(years))
names(corrs) <- years
for (y in years) {
df_y <- delays_data[[as.character(y)]]
df_y$AgeBin <- cut(df_y$PlaneAge, breaks = c(0,5,10,15,20,Inf), labels = c("0-5","6-10","11-15","16-20","21+"))
delays_age <- df_y %>%
group_by(AgeBin) %>%
summarise(AvgDepDelay = mean(DepDelay, na.rm = TRUE), .groups = 'drop')
delays_age$Year <- y
delays_age_all <- rbind(delays_age_all, delays_age)
corrs[as.character(y)] <- cor(df_y$PlaneAge, df_y$DepDelay, use = "complete.obs")
# Bar plot
p <- ggplot(delays_age, aes(x = AgeBin, y = AvgDepDelay, fill = AgeBin)) +
geom_bar(stat = "identity") + labs(title = paste("Avg Dep Delay by Plane Age -", y))
ggsave(file.path(data_path, paste0("delays_by_age_r_", y, ".png")), p)
print(p)
}
## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

# Correlations table
print(data.frame(Year = years, Correlation = round(corrs, 3)))
## Year Correlation
## 2004 2004 -0.005
## 2005 2005 -0.005
## 2006 2006 -0.001
## 2007 2007 -0.002
## 2008 2008 -0.005
# Overall line plot
p_line <- ggplot(delays_age_all, aes(x = Year, y = AvgDepDelay, color = AgeBin, group = AgeBin)) +
geom_line() + geom_point() + labs(title = "Avg Dep Delay by Plane Age Across Years")
ggsave(file.path(data_path, "delays_by_age_overall_r.png"), p_line)
## Saving 7 x 5 in image
print(p_line)

knitr::opts_chunk$set(echo = TRUE)
required_packages <- c("arrow", "dplyr", "ggplot2", "tidyr")
for (pkg in required_packages) {
if (!require(pkg, character.only = TRUE)) {
install.packages(pkg)
library(pkg, character.only = TRUE)
}
}
# Path and years
data_path <- "C:/Users/RUKN AL BAB/Desktop/ST2195_Part2/data"
years <- c(2004, 2005, 2006, 2007, 2008)
coeffs <- list()
for (y in years) {
df_year <- read_parquet(file.path(data_path, paste0("cleaned_full_", y, ".parquet")))
df_year$ArrHour <- floor(df_year$CRSArrTime / 100)
if (sum(df_year$Diverted) == 0) {
cat(sprintf("No diversions in %d. Skipping.\n", y))
next
}
# Sample
sample_size <- 500000
if (nrow(df_year) > sample_size) {
df_sample <- df_year[sample(nrow(df_year), sample_size), ]
} else {
df_sample <- df_year
}
# Ensure both classes
if (length(unique(df_sample$Diverted)) < 2) {
cat(sprintf("No diversions in sample for %d. Using full.\n", y))
df_sample <- df_year
}
# Fit logit
formula <- Diverted ~ factor(Month) + factor(DayofMonth) + DepHour + ArrHour +
OriginLat + OriginLong + DestLat + DestLong + Distance + factor(UniqueCarrier)
fit <- glm(formula, data = df_sample, family = "binomial")
coeffs[[as.character(y)]] <- coef(fit)
cat(sprintf("Model for %d: Fitted on %d samples, %d diversions\n", y, nrow(df_sample), sum(df_sample$Diverted)))
}
## Model for 2004: Fitted on 500000 samples, 946 diversions
## Model for 2005: Fitted on 500000 samples, 1006 diversions
## Model for 2006: Fitted on 500000 samples, 1133 diversions
## Model for 2007: Fitted on 500000 samples, 1162 diversions
## Model for 2008: Fitted on 500000 samples, 1223 diversions
library(arrow)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ readr 2.2.0
## ✔ lubridate 1.9.5 ✔ stringr 1.6.0
## ✔ purrr 1.2.1 ✔ tibble 3.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::duration() masks arrow::duration()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Path and years
data_path <- "C:/Users/RUKN AL BAB/Desktop/ST2195_Part2/data"
years <- c(2004, 2005, 2006, 2007, 2008)
coeffs <- list()
for (y in years) {
cat(paste("Processing year:", y, "\n"))
# Load only needed columns
df_year <- read_parquet(
file.path(data_path, paste0("cleaned_full_", y, ".parquet")),
col_select = c("Diverted","Month","DayofMonth","DepHour","CRSArrTime",
"OriginLat","OriginLong","DestLat","DestLong",
"Distance","UniqueCarrier")
)
# Create arrival hour
df_year$ArrHour <- floor(df_year$CRSArrTime / 100)
# Skip if no diversions
if (sum(df_year$Diverted, na.rm = TRUE) == 0) {
cat(sprintf("No diversions in %d. Skipping.\n", y))
next
}
# Smaller sample for 8GB RAM
sample_size <- 200000
if (nrow(df_year) > sample_size) {
df_sample <- df_year %>% sample_n(sample_size)
} else {
df_sample <- df_year
}
# Ensure both classes exist
if (length(unique(df_sample$Diverted)) < 2) {
cat(sprintf("Sample missing class for %d. Using full dataset.\n", y))
df_sample <- df_year
}
# Logistic regression
fit <- glm(
Diverted ~ factor(Month) + factor(DayofMonth) +
DepHour + ArrHour +
OriginLat + OriginLong +
DestLat + DestLong +
Distance + factor(UniqueCarrier),
data = df_sample,
family = "binomial"
)
# Save coefficients
coeffs[[as.character(y)]] <- coef(fit)
cat(sprintf("Model for %d fitted on %d rows (%d diversions)\n",
y, nrow(df_sample), sum(df_sample$Diverted)))
# Free memory
rm(df_year, df_sample, fit)
gc()
}
## Processing year: 2004
## Model for 2004 fitted on 200000 rows (413 diversions)
## Processing year: 2005
## Model for 2005 fitted on 200000 rows (357 diversions)
## Processing year: 2006
## Model for 2006 fitted on 200000 rows (475 diversions)
## Processing year: 2007
## Model for 2007 fitted on 200000 rows (447 diversions)
## Processing year: 2008
## Model for 2008 fitted on 200000 rows (462 diversions)
library(tidyverse)
if (length(coeffs) == 0) {
cat("No models fitted.\n")
} else {
key_features <- c("Distance", "DepHour", "ArrHour", "OriginLat", "DestLat")
# Use only years that actually produced models
coeff_years <- names(coeffs)
coeff_df <- data.frame(Year = as.numeric(coeff_years))
for (feat in key_features) {
coeff_df[[feat]] <- sapply(coeffs, function(c) {
if (feat %in% names(c)) {
c[feat]
} else {
NA
}
})
}
# Save coefficients table
write.csv(
coeff_df,
file.path(data_path, "logit_coeffs_key_r.csv"),
row.names = FALSE
)
# Convert to long format
coeff_long <- pivot_longer(
coeff_df,
cols = -Year,
names_to = "Feature",
values_to = "Coef"
)
# Plot
p <- ggplot(coeff_long, aes(x = Year, y = Coef, color = Feature, group = Feature)) +
geom_line() +
geom_point() +
labs(
title = "Logistic Coefficients for Diverted Across Years",
x = "Year",
y = "Coefficient"
) +
theme_minimal()
# Save plot
ggsave(
file.path(data_path, "logit_coeffs_plot_r.png"),
p,
width = 8,
height = 5
)
print(p)
# Show example coefficients
last_y <- names(coeffs)[length(coeffs)]
cat(sprintf("\nSample coefficients for %s:\n", last_y))
for (feat in key_features) {
if (feat %in% names(coeffs[[last_y]])) {
cat(sprintf("%s: %.4f\n", feat, coeffs[[last_y]][feat]))
}
}
}

##
## Sample coefficients for 2008:
## Distance: 0.0003
## DepHour: -0.2691
## ArrHour: 0.2429
## OriginLat: 0.0120
## DestLat: 0.0177