setwd("C:/Users/Cynthiali0621/Desktop")
if(!require("tidyverse")) {install.packages("tidyverse"); library("tidyverse")}
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(dplyr)
library(ggplot2)
``` r
data <- read.csv("DelayData.csv")
data$Date <- as.Date(paste(data$year, data$month, data$dayofmonth, sep = "-"))
# Aggregate data to monthly level
monthly_data <- data %>%
group_by(year, month) %>%
summarize(avg_depdelay = mean(depdelay, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Step 1: Create a time series object with monthly frequency
ts_data <- ts(monthly_data$avg_depdelay, start = c(2004, 1), frequency = 12)
# Step 2: Use training data only
train <- window(ts_data, end = c(2012, 12)) # Training: Jan 2004 to Dec 2012
# Step 3: Fit ARIMA model to the training data
arima_model <- auto.arima(train) # Automatically selects the best ARIMA model
# Step 4: Generate in-sample predictions for the training data
in_sample_forecast <- fitted(arima_model) # In-sample predictions
# Step 5: Extract actual and predicted values for visualization
train_values <- as.vector(train) # Actual values in the training data
predicted_values <- as.vector(in_sample_forecast) # Predicted values in the training data
time_index <- time(train) # Extract the time index of the training set
# Combine actual and predicted values into a data frame for visualization
plot_data <- data.frame(
Time = time_index,
Actual = train_values,
Predicted = predicted_values
)
# Step 6: Plot in-sample forecasts vs. actual values using ggplot
ggplot(plot_data, aes(x = Time)) +
geom_line(aes(y = Actual, color = "Actual"), size = 1.2) +
geom_line(aes(y = Predicted, color = "Predicted"), size = 1.2) +
labs(
title = "(Training set)ARIMA In-Sample Forecast vs Actual",
x = "Time",
y = "Average Departure Delay(in minutes)",
color = "Legend"
) +
scale_color_manual(values = c("Actual" = "red", "Predicted" = "blue")) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

# Step 7: Plot the last 50 points in the training data
last_50_indices <- (length(train_values) - 49):length(train_values)
plot_data_last_50 <- plot_data[last_50_indices, ]
ggplot(plot_data_last_50, aes(x = Time)) +
geom_line(aes(y = Actual, color = "Actual"), size = 1.2) +
geom_line(aes(y = Predicted, color = "Predicted"), size = 1.2) +
labs(
title = "(Training set)ARIMA In-Sample Forecast: Last 50 Points",
x = "Time",
y = "Average Departure Delay",
color = "Legend"
) +
scale_color_manual(values = c("Actual" = "red", "Predicted" = "blue")) +
theme_minimal()

arima_model
## Series: train
## ARIMA(1,0,2)(2,1,0)[12]
##
## Coefficients:
## ar1 ma1 ma2 sar1 sar2
## -0.6604 1.1578 0.5708 -0.4743 -0.2360
## s.e. 0.1198 0.1390 0.1354 0.1146 0.1154
##
## sigma^2 = 5.502: log likelihood = -217.47
## AIC=446.93 AICc=447.87 BIC=462.32
# Step 8: Calculate accuracy metrics
errors <- train_values - predicted_values # Residuals (errors)
# Mean Absolute Error (MAE)
mae <- mean(abs(errors))
# Root Mean Squared Error (RMSE)
rmse <- sqrt(mean(errors^2))
# Mean Absolute Percentage Error (MAPE)
mape <- mean(abs(errors / train_values)) * 100
# Print accuracy metrics
cat("Model Accuracy Metrics:\n")
## Model Accuracy Metrics:
cat("MAE: ", round(mae, 2), "\n")
## MAE: 1.56
cat("RMSE: ", round(rmse, 2), "\n")
## RMSE: 2.15
cat("MAPE: ", round(mape, 2), "%\n")
## MAPE: 20.04 %
accuracy_metrics <- accuracy(in_sample_forecast, train)
print(accuracy_metrics)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 0.01967483 2.153087 1.56225 -4.848281 20.04147 -0.04758306 0.5540357
fitted_values <- fitted(arima_model)
residuals <- train - fitted_values
ss_residual <- sum(residuals^2)
ss_total <- sum((train - mean(train))^2)
r_squared <- 1 - (ss_residual / ss_total)
cat("R-squared:", r_squared, "\n")
## R-squared: 0.5162474
fitted_values <- fitted(arima_model)
residuals <- train - fitted_values
ss_residual <- sum(residuals^2)
ss_total <- sum((train - mean(train))^2)
r_squared <- 1 - (ss_residual / ss_total)
cat("R-squared:", r_squared, "\n")
## R-squared: 0.5162474
# Step 1: Data cleaning and preparation
data <- read.csv("DelayData.csv")
data$Date <- as.Date(paste(data$year, data$month, data$dayofmonth, sep = "-"))
# Aggregate data to monthly level
monthly_data <- data %>%
group_by(year, month) %>%
summarize(avg_depdelay = mean(depdelay, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Create a sequence of months for plotting
monthly_data$time <- seq(from = as.Date("2004-01-01"), to = as.Date("2017-12-01"), by = "month")
# Step 2: Create Random Walk Model
monthly_data$PredictedDelay <- lag(monthly_data$avg_depdelay, 1)
# Step 3: Plot entire series
ggplot(monthly_data, aes(x = time)) +
geom_line(aes(y = avg_depdelay, color = "Actual"), size = 1.2) +
geom_line(aes(y = PredictedDelay, color = "Predicted"), size = 1.2) +
labs(title = "Random Walk Model: Entire Series",
x = "Time",
y = "Average Departure Delay") +
scale_color_manual(values = c("Actual" = "black", "Predicted" = "red")) +
theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# Step 4: Plot last 50 months
last_50 <- tail(monthly_data, 50)
ggplot(last_50, aes(x = time)) +
geom_line(aes(y = avg_depdelay, color = "Actual"), size = 1.2) +
geom_line(aes(y = PredictedDelay, color = "Predicted"), size = 1.2) +
labs(title = "Random Walk Model: Last 50 Months",
x = "Time",
y = "Average Departure Delay") +
scale_color_manual(values = c("Actual" = "black", "Predicted" = "red")) +
theme_minimal()

# Step 5: Evaluate performance
valid_data <- monthly_data[!is.na(monthly_data$PredictedDelay), ] # Remove NA rows
actual <- valid_data$avg_depdelay
predicted <- valid_data$PredictedDelay
r_squared <- 1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2)
cat("R-squared: ", round(r_squared, 5), "\n")
## R-squared: -0.31439
# Step 1: data cleaning
#Convert data to Date and aggregate by month
data$Date <- as.Date(paste(data$year, data$month, data$dayofmonth, sep = "-"))
monthly_data <- data %>%
group_by(year, month) %>%
summarize(avg_depdelay = mean(depdelay, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Create a sequence of months starting from January 2004 to December 2017
monthly_data$time <- seq(from = as.Date("2004-01-01"), to = as.Date("2017-12-01"), by = "month")
# Step 2: Create a time series object with monthly frequency
ts_data <- ts(monthly_data$avg_depdelay, start = c(2004, 1), frequency = 12)
# Step 3: Split data into training and testing sets
train <- window(ts_data, end = c(2012, 12)) # Training: Jan 2004 to Dec 2012
test <- window(ts_data, start = c(2013, 1)) # Testing: Jan 2013 to Dec 2017
# Step 4: Fit ARIMA model
arima_model <- auto.arima(train)
forecasted_values <- forecast(arima_model, h = length(test)) # Forecast for test period
summary(arima_model)
## Series: train
## ARIMA(1,0,2)(2,1,0)[12]
##
## Coefficients:
## ar1 ma1 ma2 sar1 sar2
## -0.6604 1.1578 0.5708 -0.4743 -0.2360
## s.e. 0.1198 0.1390 0.1354 0.1146 0.1154
##
## sigma^2 = 5.502: log likelihood = -217.47
## AIC=446.93 AICc=447.87 BIC=462.32
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01967483 2.153087 1.56225 -4.848281 20.04147 0.6609573
## ACF1
## Training set -0.04758306
# Step 5: Plot forecasts and actual test data
plot(forecasted_values, main = "ARIMA Forecast vs Actual Test Data")
lines(test, col = "red", lwd = 2)
