setwd("C:/Users/Cynthiali0621/Desktop")
if(!require("tidyverse")) {install.packages("tidyverse"); library("tidyverse")}
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(dplyr)
library(ggplot2)


data <- read.csv("DelayData.csv")
data$Date <- as.Date(paste(data$year, data$month, data$dayofmonth, sep = "-"))

# Aggregate data to monthly level
monthly_data <- data %>%
  group_by(year, month) %>%
  summarize(avg_depdelay = mean(depdelay, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Step 1: Create a time series object with monthly frequency
ts_data <- ts(monthly_data$avg_depdelay, start = c(2004, 1), frequency = 12)
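# Optional sanity check (not part of the original analysis): the ts() call above assumes
# the aggregated rows are complete and already sorted by year, then month.
stopifnot(identical(order(monthly_data$year, monthly_data$month),
                    seq_len(nrow(monthly_data))),
          nrow(monthly_data) == 12 * n_distinct(monthly_data$year))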

# Step 2: Use training data only
train <- window(ts_data, end = c(2012, 12))  # Training: Jan 2004 to Dec 2012

# Step 3: Fit ARIMA model to the training data
arima_model <- auto.arima(train)  # Automatically selects the best ARIMA model

# Step 4: Generate in-sample predictions for the training data
in_sample_forecast <- fitted(arima_model)  # One-step-ahead in-sample predictions

# Step 5: Extract actual and predicted values for visualization
train_values <- as.vector(train)  # Actual values in the training data
predicted_values <- as.vector(in_sample_forecast)  # Predicted values in the training data
time_index <- as.numeric(time(train))  # Time index as plain numbers (keeps ggplot's continuous scale happy)

# Combine actual and predicted values into a data frame for visualization
plot_data <- data.frame(
  Time = time_index,
  Actual = train_values,
  Predicted = predicted_values
)

# Step 6: Plot in-sample forecasts vs. actual values using ggplot
ggplot(plot_data, aes(x = Time)) +
  geom_line(aes(y = Actual, color = "Actual"), linewidth = 1.2) +
  geom_line(aes(y = Predicted, color = "Predicted"), linewidth = 1.2) +
  labs(
    title = "ARIMA In-Sample Forecast vs. Actual (Training Set)",
    x = "Time",
    y = "Average Departure Delay (minutes)",
    color = "Legend"
  ) +
  scale_color_manual(values = c("Actual" = "red", "Predicted" = "blue")) +
  theme_minimal()

# Step 7: Plot the last 50 points in the training data
last_50_indices <- (length(train_values) - 49):length(train_values)
plot_data_last_50 <- plot_data[last_50_indices, ]

ggplot(plot_data_last_50, aes(x = Time)) +
  geom_line(aes(y = Actual, color = "Actual"), linewidth = 1.2) +
  geom_line(aes(y = Predicted, color = "Predicted"), linewidth = 1.2) +
  labs(
    title = "ARIMA In-Sample Forecast: Last 50 Points (Training Set)",
    x = "Time",
    y = "Average Departure Delay (minutes)",
    color = "Legend"
  ) +
  scale_color_manual(values = c("Actual" = "red", "Predicted" = "blue")) +
  theme_minimal()

arima_model
## Series: train 
## ARIMA(1,0,2)(2,1,0)[12] 
## 
## Coefficients:
##           ar1     ma1     ma2     sar1     sar2
##       -0.6604  1.1578  0.5708  -0.4743  -0.2360
## s.e.   0.1198  0.1390  0.1354   0.1146   0.1154
## 
## sigma^2 = 5.502:  log likelihood = -217.47
## AIC=446.93   AICc=447.87   BIC=462.32
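# Residual diagnostics (not part of the original analysis): a quick check that the ARIMA
# residuals behave like white noise. forecast::checkresiduals() plots the residuals, their
# ACF, and a histogram, and prints a Ljung-Box test.
checkresiduals(arima_model)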
# Step 8: Calculate accuracy metrics
errors <- train_values - predicted_values  # Residuals (errors)

# Mean Absolute Error (MAE)
mae <- mean(abs(errors))

# Root Mean Squared Error (RMSE)
rmse <- sqrt(mean(errors^2))

# Mean Absolute Percentage Error (MAPE)
mape <- mean(abs(errors / train_values)) * 100

# Print accuracy metrics
cat("Model Accuracy Metrics:\n")
## Model Accuracy Metrics:
cat("MAE: ", round(mae, 2), "\n")
## MAE:  1.56
cat("RMSE: ", round(rmse, 2), "\n")
## RMSE:  2.15
cat("MAPE: ", round(mape, 2), "%\n")
## MAPE:  20.04 %
accuracy_metrics <- accuracy(in_sample_forecast, train)
print(accuracy_metrics)
##                  ME     RMSE     MAE       MPE     MAPE        ACF1 Theil's U
## Test set 0.01967483 2.153087 1.56225 -4.848281 20.04147 -0.04758306 0.5540357
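# Note (an editorial aside, not in the original analysis): with average delays this close to
# zero, MAPE is easily inflated by small actual values. The forecast package's accuracy()
# table also reports MASE, a scale-free alternative, for the training set.
accuracy(arima_model)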
fitted_values <- fitted(arima_model)
residuals <- train - fitted_values
ss_residual <- sum(residuals^2)
ss_total <- sum((train - mean(train))^2)
r_squared <- 1 - (ss_residual / ss_total)
cat("R-squared:", r_squared, "\n")
## R-squared: 0.5162474
# Step 1: Data cleaning and preparation
data <- read.csv("DelayData.csv")
data$Date <- as.Date(paste(data$year, data$month, data$dayofmonth, sep = "-"))

# Aggregate data to monthly level
monthly_data <- data %>%
  group_by(year, month) %>%
  summarize(avg_depdelay = mean(depdelay, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Create a sequence of months for plotting
monthly_data$time <- seq(from = as.Date("2004-01-01"), to = as.Date("2017-12-01"), by = "month")

# Step 2: Random walk model: predict each month using the previous month's actual delay
monthly_data$PredictedDelay <- dplyr::lag(monthly_data$avg_depdelay, 1)  # dplyr::lag() shifts by one and pads the first value with NA

# Step 3: Plot entire series
ggplot(monthly_data, aes(x = time)) +
  geom_line(aes(y = avg_depdelay, color = "Actual"), linewidth = 1.2) +
  geom_line(aes(y = PredictedDelay, color = "Predicted"), linewidth = 1.2) +
  labs(title = "Random Walk Model: Entire Series",
       x = "Time",
       y = "Average Departure Delay (minutes)") +
  scale_color_manual(values = c("Actual" = "black", "Predicted" = "red")) +
  theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# Step 4: Plot last 50 months
last_50 <- tail(monthly_data, 50)

ggplot(last_50, aes(x = time)) +
  geom_line(aes(y = avg_depdelay, color = "Actual"), linewidth = 1.2) +
  geom_line(aes(y = PredictedDelay, color = "Predicted"), linewidth = 1.2) +
  labs(title = "Random Walk Model: Last 50 Months",
       x = "Time",
       y = "Average Departure Delay (minutes)") +
  scale_color_manual(values = c("Actual" = "black", "Predicted" = "red")) +
  theme_minimal()

# Step 5: Evaluate performance
valid_data <- monthly_data[!is.na(monthly_data$PredictedDelay), ]  # Remove NA rows
actual <- valid_data$avg_depdelay
predicted <- valid_data$PredictedDelay
r_squared <- 1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2)
cat("R-squared: ", round(r_squared, 5), "\n")
## R-squared:  -0.31439
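# Cross-check (not part of the original analysis): forecast::naive() implements the same
# one-step-behind random walk, so its fitted values should match PredictedDelay above
# (the first value is NA in both cases).
rw_fit <- naive(ts_data)
head(cbind(naive_fitted = as.numeric(fitted(rw_fit)),
           lagged_actual = monthly_data$PredictedDelay))
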
# Step 1: Data cleaning
# Convert dates and aggregate to monthly level
data$Date <- as.Date(paste(data$year, data$month, data$dayofmonth, sep = "-"))
monthly_data <- data %>%
  group_by(year, month) %>%
  summarize(avg_depdelay = mean(depdelay, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Create a sequence of months starting from January 2004 to December 2017
monthly_data$time <- seq(from = as.Date("2004-01-01"), to = as.Date("2017-12-01"), by = "month")
# Step 2: Create a time series object with monthly frequency
ts_data <- ts(monthly_data$avg_depdelay, start = c(2004, 1), frequency = 12)

# Step 3: Split data into training and testing sets
train <- window(ts_data, end = c(2012, 12))  # Training: Jan 2004 to Dec 2012
test <- window(ts_data, start = c(2013, 1))  # Testing: Jan 2013 to Dec 2017
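# Quick check (not part of the original analysis): the two windows should partition the
# full series with no gap or overlap.
stopifnot(length(train) + length(test) == length(ts_data))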

# Step 4: Fit ARIMA model
arima_model <- auto.arima(train)
forecasted_values <- forecast(arima_model, h = length(test))  # Forecast for test period
summary(arima_model)
## Series: train 
## ARIMA(1,0,2)(2,1,0)[12] 
## 
## Coefficients:
##           ar1     ma1     ma2     sar1     sar2
##       -0.6604  1.1578  0.5708  -0.4743  -0.2360
## s.e.   0.1198  0.1390  0.1354   0.1146   0.1154
## 
## sigma^2 = 5.502:  log likelihood = -217.47
## AIC=446.93   AICc=447.87   BIC=462.32
## 
## Training set error measures:
##                      ME     RMSE     MAE       MPE     MAPE      MASE
## Training set 0.01967483 2.153087 1.56225 -4.848281 20.04147 0.6609573
##                     ACF1
## Training set -0.04758306
# Step 5: Plot forecasts and actual test data
plot(forecasted_values, main = "ARIMA Forecast vs. Actual Test Data")
lines(test, col = "red", lwd = 2)
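
# Suggested follow-up (not part of the original run): quantify out-of-sample accuracy by
# comparing the forecasts against the held-out Jan 2013 to Dec 2017 observations.
accuracy(forecasted_values, test)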