ST2195 Part 2 (a) and (b) - R Implementation

# Path to data (from Python)
data_path <- "C:/Users/RUKN AL BAB/Desktop/ST2195_Part2/data"
years <- c(2004, 2005, 2006, 2007, 2008)

# Load all years (filter non-diverted for delays)
delays_data <- list()
for (y in years) {
  df <- read_parquet(file.path(data_path, paste0("cleaned_full_", y, ".parquet")))
  df$TotalDelay <- df$DepDelay + df$ArrDelay  # Create total delay
  df$DepHour <- floor(df$CRSDepTime / 100)  # Dep hour
  # Filter non-diverted (delays defined)
  delays_data[[as.character(y)]] <- df %>% filter(Diverted == 0)
}

# (a) Best times/days to minimize delays
best_times <- list()
for (y in years) {
  df_y <- delays_data[[as.character(y)]]
  delays <- df_y %>%
    group_by(DayOfWeek, DepHour) %>%
    summarise(AvgDelay = mean(TotalDelay, na.rm = TRUE), .groups = 'drop')
  
  min_delay <- delays[which.min(delays$AvgDelay), ]
  day_map <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
  best_times[[y]] <- list(Day = day_map[min_delay$DayOfWeek], Hour = min_delay$DepHour, AvgDelay = round(min_delay$AvgDelay, 2))
  
  # Heatmap
  p <- ggplot(delays, aes(x = factor(DayOfWeek, labels = day_map), y = DepHour, fill = AvgDelay)) +
    geom_tile() + scale_fill_gradient(low = "blue", high = "yellow") +
    labs(title = paste("Avg Delay Heatmap -", y), x = "Day of Week", y = "Dep Hour")
  ggsave(file.path(data_path, paste0("delay_heatmap_r_", y, ".png")), p)
  print(p)  # Show in report
}

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

# Best times table
best_df <- data.frame(Year = years, do.call(rbind, best_times))
print(best_df)

##   Year Day Hour AvgDelay
## 1 2004 Sun    4   -17.22
## 2 2005 Wed    3     -6.5
## 3 2006 Thu    2     -5.8
## 4 2007 Wed    4    -0.44
## 5 2008 Sun    3   -12.25

# (b) Older planes and delays
delays_age_all <- data.frame()
corrs <- numeric(length(years))
names(corrs) <- years

for (y in years) {
  df_y <- delays_data[[as.character(y)]]
  df_y$AgeBin <- cut(df_y$PlaneAge, breaks = c(0,5,10,15,20,Inf), labels = c("0-5","6-10","11-15","16-20","21+"))
  
  delays_age <- df_y %>%
    group_by(AgeBin) %>%
    summarise(AvgDepDelay = mean(DepDelay, na.rm = TRUE), .groups = 'drop')
  delays_age$Year <- y
  delays_age_all <- rbind(delays_age_all, delays_age)
  
  corrs[as.character(y)] <- cor(df_y$PlaneAge, df_y$DepDelay, use = "complete.obs")
  
  # Bar plot
  p <- ggplot(delays_age, aes(x = AgeBin, y = AvgDepDelay, fill = AgeBin)) +
    geom_bar(stat = "identity") + labs(title = paste("Avg Dep Delay by Plane Age -", y))
  ggsave(file.path(data_path, paste0("delays_by_age_r_", y, ".png")), p)
  print(p)
}

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

# Correlations table
print(data.frame(Year = years, Correlation = round(corrs, 3)))

##      Year Correlation
## 2004 2004      -0.005
## 2005 2005      -0.005
## 2006 2006      -0.001
## 2007 2007      -0.002
## 2008 2008      -0.005

# Overall line plot
p_line <- ggplot(delays_age_all, aes(x = Year, y = AvgDepDelay, color = AgeBin, group = AgeBin)) +
  geom_line() + geom_point() + labs(title = "Avg Dep Delay by Plane Age Across Years")
ggsave(file.path(data_path, "delays_by_age_overall_r.png"), p_line)

## Saving 7 x 5 in image

print(p_line)

knitr::opts_chunk$set(echo = TRUE)
required_packages <- c("arrow", "dplyr", "ggplot2", "tidyr")
for (pkg in required_packages) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}

# Path and years
data_path <- "C:/Users/RUKN AL BAB/Desktop/ST2195_Part2/data"
years <- c(2004, 2005, 2006, 2007, 2008)

coeffs <- list()

for (y in years) {
  df_year <- read_parquet(file.path(data_path, paste0("cleaned_full_", y, ".parquet")))
  df_year$ArrHour <- floor(df_year$CRSArrTime / 100)
  
  if (sum(df_year$Diverted) == 0) {
    cat(sprintf("No diversions in %d. Skipping.\n", y))
    next
  }
  
  # Sample
  sample_size <- 500000
  if (nrow(df_year) > sample_size) {
    df_sample <- df_year[sample(nrow(df_year), sample_size), ]
  } else {
    df_sample <- df_year
  }
  
  # Ensure both classes
  if (length(unique(df_sample$Diverted)) < 2) {
    cat(sprintf("No diversions in sample for %d. Using full.\n", y))
    df_sample <- df_year
  }
  
  # Fit logit
  formula <- Diverted ~ factor(Month) + factor(DayofMonth) + DepHour + ArrHour + 
              OriginLat + OriginLong + DestLat + DestLong + Distance + factor(UniqueCarrier)
  fit <- glm(formula, data = df_sample, family = "binomial")
  
  coeffs[[as.character(y)]] <- coef(fit)
  
  cat(sprintf("Model for %d: Fitted on %d samples, %d diversions\n", y, nrow(df_sample), sum(df_sample$Diverted)))
}

## Model for 2004: Fitted on 500000 samples, 946 diversions
## Model for 2005: Fitted on 500000 samples, 1006 diversions
## Model for 2006: Fitted on 500000 samples, 1133 diversions
## Model for 2007: Fitted on 500000 samples, 1162 diversions
## Model for 2008: Fitted on 500000 samples, 1223 diversions

library(arrow)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.2.0
## ✔ lubridate 1.9.5     ✔ stringr   1.6.0
## ✔ purrr     1.2.1     ✔ tibble    3.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::duration() masks arrow::duration()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::lag()          masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Path and years
data_path <- "C:/Users/RUKN AL BAB/Desktop/ST2195_Part2/data"
years <- c(2004, 2005, 2006, 2007, 2008)

coeffs <- list()

for (y in years) {
  
  cat(paste("Processing year:", y, "\n"))
  
  # Load only needed columns
  df_year <- read_parquet(
    file.path(data_path, paste0("cleaned_full_", y, ".parquet")),
    col_select = c("Diverted","Month","DayofMonth","DepHour","CRSArrTime",
                   "OriginLat","OriginLong","DestLat","DestLong",
                   "Distance","UniqueCarrier")
  )
  
  # Create arrival hour
  df_year$ArrHour <- floor(df_year$CRSArrTime / 100)
  
  # Skip if no diversions
  if (sum(df_year$Diverted, na.rm = TRUE) == 0) {
    cat(sprintf("No diversions in %d. Skipping.\n", y))
    next
  }
  
  # Smaller sample for 8GB RAM
  sample_size <- 200000
  
  if (nrow(df_year) > sample_size) {
    df_sample <- df_year %>% sample_n(sample_size)
  } else {
    df_sample <- df_year
  }
  
  # Ensure both classes exist
  if (length(unique(df_sample$Diverted)) < 2) {
    cat(sprintf("Sample missing class for %d. Using full dataset.\n", y))
    df_sample <- df_year
  }
  
  # Logistic regression
  fit <- glm(
    Diverted ~ factor(Month) + factor(DayofMonth) +
      DepHour + ArrHour +
      OriginLat + OriginLong +
      DestLat + DestLong +
      Distance + factor(UniqueCarrier),
    data = df_sample,
    family = "binomial"
  )
  
  # Save coefficients
  coeffs[[as.character(y)]] <- coef(fit)
  
  cat(sprintf("Model for %d fitted on %d rows (%d diversions)\n",
              y, nrow(df_sample), sum(df_sample$Diverted)))
  
  # Free memory
  rm(df_year, df_sample, fit)
  gc()
}

## Processing year: 2004 
## Model for 2004 fitted on 200000 rows (413 diversions)
## Processing year: 2005 
## Model for 2005 fitted on 200000 rows (357 diversions)
## Processing year: 2006 
## Model for 2006 fitted on 200000 rows (475 diversions)
## Processing year: 2007 
## Model for 2007 fitted on 200000 rows (447 diversions)
## Processing year: 2008 
## Model for 2008 fitted on 200000 rows (462 diversions)

library(tidyverse)

if (length(coeffs) == 0) {
  cat("No models fitted.\n")
  
} else {

  key_features <- c("Distance", "DepHour", "ArrHour", "OriginLat", "DestLat")

  # Use only years that actually produced models
  coeff_years <- names(coeffs)

  coeff_df <- data.frame(Year = as.numeric(coeff_years))

  for (feat in key_features) {
    coeff_df[[feat]] <- sapply(coeffs, function(c) {
      if (feat %in% names(c)) {
        c[feat]
      } else {
        NA
      }
    })
  }

  # Save coefficients table
  write.csv(
    coeff_df,
    file.path(data_path, "logit_coeffs_key_r.csv"),
    row.names = FALSE
  )

  # Convert to long format
  coeff_long <- pivot_longer(
    coeff_df,
    cols = -Year,
    names_to = "Feature",
    values_to = "Coef"
  )

  # Plot
  p <- ggplot(coeff_long, aes(x = Year, y = Coef, color = Feature, group = Feature)) +
    geom_line() +
    geom_point() +
    labs(
      title = "Logistic Coefficients for Diverted Across Years",
      x = "Year",
      y = "Coefficient"
    ) +
    theme_minimal()

  # Save plot
  ggsave(
    file.path(data_path, "logit_coeffs_plot_r.png"),
    p,
    width = 8,
    height = 5
  )

  print(p)

  # Show example coefficients
  last_y <- names(coeffs)[length(coeffs)]

  cat(sprintf("\nSample coefficients for %s:\n", last_y))

  for (feat in key_features) {
    if (feat %in% names(coeffs[[last_y]])) {
      cat(sprintf("%s: %.4f\n", feat, coeffs[[last_y]][feat]))
    }
  }
}

## 
## Sample coefficients for 2008:
## Distance: 0.0003
## DepHour: -0.2691
## ArrHour: 0.2429
## OriginLat: 0.0120
## DestLat: 0.0177