INFO-H 510 Final Project

Primary objective

Disclaimer: Initial Draft

Introduction

Contents

Data Cleaning

1. Read data

library(readr)
library(ggplot2)
library(patchwork)
library(dplyr)
library(lubridate)
library(GGally)
library(corrplot)
library(car)
library(ggthemes)
library(ggrepel)
library(boot)
library(broom)
library(lindia)
library(tidyverse)
library(zoo)
library(forecast)
library(tsoutliers)
week2=read_csv("C:/Users/rajas/Desktop/Desktop/Applied Data Science/INFOH510/R Jupyter/Metro_Interstate_Traffic_Volume.csv")
week2 = week2[week2$temp > 0, ]      # drop spurious 0 K temperature readings
week2 = week2[week2$rain_1h < 60, ]  # drop extreme one-hour rainfall outliers
week2 <- week2 |>
  mutate(temp = (temp - 273) * 9/5 + 32)  # convert temperature from Kelvin to Fahrenheit
# Derive hour, month, year, day, and weekday from date_time for later analysis
week2$hour <- as.integer(format(as.POSIXct(week2$date_time), "%H"))
week2$month <- month(as.integer(format(as.POSIXct(week2$date_time), "%m")), label = TRUE)  # lubridate month labels
week2$year <- as.integer(format(as.POSIXct(week2$date_time), "%y"))  # two-digit year (e.g. 16), used for filtering below
week2$day <- as.integer(format(as.POSIXct(week2$date_time), "%d"))
week2$weekday <- weekdays(as.Date(week2$date_time))
week2$weekday <- factor(week2$weekday, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))  # order the weekdays Monday-Sunday
df<-week2

2. Data homogeneity

df %>%
  mutate(year = year(date_time)) %>%
  count(year) %>%
  ggplot(aes(x = factor(year), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Hourly Records Per Year", x = "Year", y = "Number of Records") +
  theme_minimal()

3. Missing data

# Create a complete sequence of hourly timestamps
complete_times <- data.frame(date_time = seq(min(df$date_time), max(df$date_time), by = "hour"))

# Join with the existing dataset to find missing timestamps
df_full <- full_join(complete_times, df, by = "date_time") %>%
  arrange(date_time)

# Check where traffic_volume is NA (indicates a missing record)
missing_times <- df_full %>% filter(is.na(traffic_volume))

# Summary of missing timestamps
missing_summary <- missing_times %>%
  mutate(year = year(date_time)) %>%
  count(year)

print(missing_summary)
  year    n
1 2012   72
2 2013 1466
3 2014 4269
4 2015 5167
5 2016  947
6 2017   47
7 2018   19

4. Filtering and interpolating data

TODO: add before-and-after visualizations of the date coverage for this filtering step; a rough sketch appears at the end of this subsection.

# 1. Keep only 2016-2018 (year is the two-digit year derived above)
df_clean <- df[df$year >= 16, ]

# 2. Create full hourly sequence for 2016-2018
full_seq <- tibble(date_time = seq(from = min(df_clean$date_time),
                                   to   = max(df_clean$date_time),
                                   by   = "hour"))

# 3. Join and detect gaps
df_full <- full_seq %>%
  left_join(df_clean, by = "date_time") %>%
  arrange(date_time)

# 4. Fill missing values via linear interpolation (or LOCF if preferred)
df_interp <- df_full %>%
  mutate(across(c(temp, rain_1h, snow_1h, clouds_all, traffic_volume),
                ~na.approx(., x = date_time, na.rm = FALSE))) %>%
  fill(holiday, weather_main, weather_description, .direction = "downup")

# Optional: Confirm no more holes
sum(is.na(df_interp$traffic_volume))  # Should be 0
[1] 0
# Flag the filled-in points (hours where traffic_volume was NA before interpolation)
df_interp <- df_interp %>%
  mutate(missing = is.na(df_full$traffic_volume))

ggplot(df_interp, aes(x = date_time, y = traffic_volume)) +
  geom_line(color = "steelblue") +
  geom_point(data = filter(df_interp, missing), color = "red", size = 0.6) +
  labs(title = "Traffic Volume Over Time (2016–2018)",
       subtitle = "Interpolated (previously missing) hours highlighted in red") +
  theme_minimal()

df_clean<-df_interp
# Create a complete sequence of hourly timestamps
complete_times <- data.frame(date_time = seq(min(df_interp$date_time), max(df_interp$date_time), by = "hour"))

# Join with the existing dataset to find missing timestamps
df_full <- full_join(complete_times, df_interp, by = "date_time") %>%
  arrange(date_time)

# Check where traffic_volume is NA (indicates a missing record)
missing_times <- df_full %>% filter(is.na(traffic_volume))

# Summary of missing timestamps
missing_summary <- missing_times %>%
  mutate(year = year(date_time)) %>%
  count(year)

print(missing_summary)
[1] year n   
<0 rows> (or 0-length row.names)
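
Following the note above, here is a rough before-and-after coverage sketch (not part of the original draft output): it compares hourly record counts per year in the raw data (df) against the filtered and interpolated data (df_interp).

# Before/after coverage check (sketch): record counts per year, raw vs. cleaned
coverage <- bind_rows(
  df %>% mutate(stage = "Before (raw 2012-2018)"),
  df_interp %>% mutate(stage = "After (2016-2018, interpolated)")
) %>%
  mutate(year = year(date_time)) %>%
  count(stage, year)

ggplot(coverage, aes(x = factor(year), y = n, fill = stage)) +
  geom_col(position = "dodge") +
  labs(title = "Hourly Records Per Year: Before vs. After Cleaning",
       x = "Year", y = "Number of Records", fill = NULL) +
  theme_minimal()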

Inferential Statistics

1. Summary

# Summary statistics of traffic volume
summary(df_clean$traffic_volume)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0    1241    3442    3273    4913    7280 
# Grouped summaries by weekday, hour
df_clean %>%
  mutate(weekday = wday(date_time, label = TRUE),
         hour = hour(date_time)) %>%
  group_by(weekday) %>%
  summarise(mean_volume = mean(traffic_volume), .groups = "drop") %>%
  ggplot(aes(x = weekday, y = mean_volume)) +
  geom_col(fill = "tomato") +
  labs(title = "Average Traffic Volume by Weekday")

2. Hypothesis Testing

Q: Is there a significant difference in traffic volume between weekdays and weekends?

df_clean <- df_clean %>%
  mutate(weekend = ifelse(wday(date_time) %in% c(1, 7), "Weekend", "Weekday"))

# t-test
t_test_result <- t.test(traffic_volume ~ weekend, data = df_clean)
print(t_test_result)

    Welch Two Sample t-test

data:  traffic_volume by weekend
t = 41.519, df = 19654, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Weekday and group Weekend is not equal to 0
95 percent confidence interval:
 882.4137 969.8589
sample estimates:
mean in group Weekday mean in group Weekend 
             3535.217              2609.081 

Q: Does rainfall or snowfall affect traffic volume?

# Categorize rainfall
df_clean <- df_clean %>%
  mutate(rain_group = ifelse(rain_1h > 0, "Rain", "No Rain"),
         snow_group = ifelse(snow_1h > 0, "Snow", "No Snow"))

# ANOVA for rain effect
anova_rain <- aov(traffic_volume ~ rain_group, data = df_clean)
summary(anova_rain)
               Df    Sum Sq  Mean Sq F value Pr(>F)  
rain_group      1 1.265e+07 12649696   3.271 0.0705 .
Residuals   28870 1.117e+11  3867793                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA for snow effect
anova_snow <- aov(traffic_volume ~ snow_group, data = df_clean)
summary(anova_snow)
               Df    Sum Sq Mean Sq F value Pr(>F)
snow_group      1 1.228e+06 1228079   0.317  0.573
Residuals   28870 1.117e+11 3868188               

3. Correlation & Regression

Correlation Matrix

df_corr <- df_clean %>%
  select(traffic_volume, temp, rain_1h, snow_1h, clouds_all)

cor_matrix <- cor(df_corr, use = "complete.obs")
print(cor_matrix)
               traffic_volume        temp      rain_1h      snow_1h  clouds_all
traffic_volume    1.000000000  0.11936974 -0.019437214 -0.003082033  0.09874833
temp              0.119369741  1.00000000  0.093895807 -0.020219846 -0.06326886
rain_1h          -0.019437214  0.09389581  1.000000000 -0.003490101  0.07535976
snow_1h          -0.003082033 -0.02021985 -0.003490101  1.000000000  0.02989620
clouds_all        0.098748334 -0.06326886  0.075359755  0.029896204  1.00000000
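
The corrplot package loaded at the top of the report can also render this matrix graphically; a minimal sketch:

# Visualize the correlation matrix (sketch using the already-loaded corrplot package)
corrplot(cor_matrix, method = "color", type = "upper",
         addCoef.col = "black", tl.col = "black")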

Linear Regression

model <- lm(traffic_volume ~ temp + rain_1h + snow_1h + clouds_all, data = df_clean)
summary(model)

Call:
lm(formula = traffic_volume ~ temp + rain_1h + snow_1h + clouds_all, 
    data = df_clean)

Residuals:
    Min      1Q  Median      3Q     Max 
-3807.8 -1888.9   168.4  1599.9  5394.3 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2462.9500    31.4702  78.263  < 2e-16 ***
temp           11.5146     0.5178  22.237  < 2e-16 ***
rain_1h      -120.4670    17.6398  -6.829  8.7e-12 ***
snow_1h     -1726.2851  2581.5666  -0.669    0.504    
clouds_all      5.4926     0.2913  18.857  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1940 on 28867 degrees of freedom
Multiple R-squared:  0.02718,   Adjusted R-squared:  0.02705 
F-statistic: 201.6 on 4 and 28867 DF,  p-value: < 2.2e-16

4. Probability Modeling

Fit traffic volume distribution & check normality

# Histogram & density
ggplot(df_clean, aes(x = traffic_volume)) +
  geom_histogram(aes(y = after_stat(density)), bins = 50, fill = "skyblue", alpha = 0.6) +
  geom_density(color = "red") +
  labs(title = "Traffic Volume Distribution")

# Shapiro-Wilk normality test (for a sample due to large data)
shapiro.test(sample(df_clean$traffic_volume, 5000))

    Shapiro-Wilk normality test

data:  sample(df_clean$traffic_volume, 5000)
W = 0.93343, p-value < 2.2e-16
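
As a visual companion to the Shapiro-Wilk test (a small sketch, not part of the original output), a normal Q-Q plot of a sample of the same size shows where the distribution departs from normality:

# Normal Q-Q plot of a random sample of traffic volumes (sketch)
set.seed(42)  # reproducible sample; the seed choice is arbitrary
vol_sample <- sample(df_clean$traffic_volume, 5000)
qqnorm(vol_sample, main = "Normal Q-Q Plot: Traffic Volume (sample of 5,000)")
qqline(vol_sample, col = "red")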

5. Effect of Holidays & Time of Day

df_clean <- df_clean %>%
  mutate(hour = hour(date_time),
         is_holiday = ifelse(holiday != "None", "Holiday", "Non-Holiday"))

# Holiday effect
ggplot(df_clean, aes(x = is_holiday, y = traffic_volume)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Traffic Volume on Holidays vs Non-Holidays")

# Hourly trend
df_clean %>%
  group_by(hour) %>%
  summarise(avg_volume = mean(traffic_volume)) %>%
  ggplot(aes(x = hour, y = avg_volume)) +
  geom_line(color = "darkblue", linewidth = 1) +
  labs(title = "Hourly Average Traffic Volume")

Weekdays have substantially higher traffic volume than weekends (Welch t-test, p < 2.2e-16). Treated as simple rain/no-rain and snow/no-snow groups, precipitation shows little effect: the rain effect is only marginal (p ≈ 0.07) and the snow effect is not significant (p ≈ 0.57). Strong daily and weekly patterns are visible in the hourly and weekday averages. Temperature shows a mild positive correlation with traffic volume (r ≈ 0.12), and the boxplot suggests that holidays, especially federal holidays, tend to reduce volume.
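
The holiday claim above rests on the boxplot alone. A quick formal check (a sketch, using the is_holiday flag defined earlier) would be a Welch t-test analogous to the weekday/weekend comparison; note that in this dataset the holiday label appears to be attached to only a single hour of each holiday, so the "Holiday" group is small and the test has limited power.

# Sketch: compare holiday vs. non-holiday hours the same way as weekday vs. weekend
t.test(traffic_volume ~ is_holiday, data = df_clean)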

Logistic Regression

Binary Response Variable

threshold <- quantile(df_clean$traffic_volume, 0.75)

# Create binary response variable
df_clean <- df_clean %>%
  mutate(high_traffic = ifelse(traffic_volume >= threshold, 1, 0),
         hour = hour(date_time),
         weekday = wday(date_time, label = TRUE),
         is_holiday = ifelse(holiday != "None", "Holiday", "Non-Holiday"))

Fit Logistic Regression Model

We’ll include variables like:

  • hour, weekday

  • temp, rain_1h, snow_1h

  • clouds_all

  • is_holiday

# Convert categorical variables
df_model <- df_clean %>%
  mutate(weekday = as.factor(weekday),
         is_holiday = as.factor(is_holiday))

# Fit logistic model
logit_model <- glm(high_traffic ~ hour + weekday + temp + rain_1h + snow_1h + clouds_all + is_holiday,
                   data = df_model, family = binomial)

summary(logit_model)

Call:
glm(formula = high_traffic ~ hour + weekday + temp + rain_1h + 
    snow_1h + clouds_all + is_holiday, family = binomial, data = df_model)

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -1.449e+01  8.902e+01  -0.163   0.8707    
hour                   8.575e-03  2.113e-03   4.057 4.97e-05 ***
weekday.L              1.349e+00  9.459e-02  14.258  < 2e-16 ***
weekday.Q             -3.230e+00  9.137e-02 -35.352  < 2e-16 ***
weekday.C              3.370e-01  7.196e-02   4.683 2.83e-06 ***
weekday^4             -1.263e+00  5.052e-02 -25.006  < 2e-16 ***
weekday^5              5.167e-02  3.751e-02   1.377   0.1684    
weekday^6             -1.657e-01  3.341e-02  -4.959 7.10e-07 ***
temp                   6.143e-03  6.742e-04   9.111  < 2e-16 ***
rain_1h               -7.974e-02  2.527e-02  -3.156   0.0016 ** 
snow_1h                8.304e-01  2.707e+00   0.307   0.7590    
clouds_all             1.609e-03  3.728e-04   4.316 1.59e-05 ***
is_holidayNon-Holiday  1.250e+01  8.902e+01   0.140   0.8884    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 32485  on 28871  degrees of freedom
Residual deviance: 28394  on 28859  degrees of freedom
AIC: 28420

Number of Fisher Scoring iterations: 12

Model Diagnostics

Pseudo R² & AIC

library(pscl)
pR2(logit_model)  # McFadden's R²
fitting null model for pseudo-r2
          llh       llhNull            G2      McFadden          r2ML 
-1.419705e+04 -1.624233e+04  4.090566e+03  1.259230e-01  1.321005e-01 
         r2CU 
 1.955918e-01 

Confusion Matrix & Accuracy

# Predict probabilities and classify
df_model$predicted <- ifelse(predict(logit_model, type = "response") > 0.5, 1, 0)

# Confusion matrix
table(Predicted = df_model$predicted, Actual = df_model$high_traffic)
         Actual
Predicted     0     1
        0 21648  7224
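
The heading promises an accuracy figure, but none is computed above. Note that the matrix has only a Predicted = 0 row: at the 0.5 cutoff the model classifies every hour as not high traffic, so overall accuracy is simply the share of the majority class (21648 / 28872, roughly 75%). A minimal sketch (using the predicted and high_traffic columns created above) for accuracy and per-class rates:

# Overall accuracy and class-wise rates (sketch; results not shown in this draft)
accuracy    <- mean(df_model$predicted == df_model$high_traffic)
sensitivity <- mean(df_model$predicted[df_model$high_traffic == 1] == 1)  # true positive rate
specificity <- mean(df_model$predicted[df_model$high_traffic == 0] == 0)  # true negative rate
c(accuracy = accuracy, sensitivity = sensitivity, specificity = specificity)

A lower classification cutoff (or a different definition of "high traffic") may be worth exploring, since the positive class is never predicted at 0.5.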

ROC Curve & AUC

library(pROC)
roc_obj <- roc(df_model$high_traffic, predict(logit_model, type = "response"))
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_obj, col = "blue", main = "ROC Curve")

auc(roc_obj)
Area under the curve: 0.7148

Residual Analysis (for influence)

# Plot deviance residuals (residuals(..., type = "deviance") is more appropriate
# for a GLM than the raw working residuals in logit_model$residuals)
plot(residuals(logit_model, type = "deviance"),
     main = "Deviance Residuals of Logistic Model", pch = 20)
abline(h = 0, col = "red")

# Leverage plot
plot(hatvalues(logit_model), main = "Leverage", pch = 20)
abline(h = 2*mean(hatvalues(logit_model)), col = "red", lty = 2)

Anomaly detection

Holt-Winters Anomaly Detection in R

  1. Decompose and smooth the traffic volume time series.

  2. Detect points where observed values significantly deviate from the expected trend/seasonality.

  3. Visualize and interpret anomalies year by year.

1. Prepare the Time Series

# Optional: aggregate hourly if necessary
df_ts <- df_interp %>%
  select(date_time, traffic_volume) %>%
  arrange(date_time)

# Create time series object (hourly, start in 2016).
# Note: frequency = 24 * 365.25 treats a whole year as the seasonal period, which makes
# HoltWinters estimate thousands of seasonal terms and run slowly; frequency = 24
# (daily seasonality) is a common, much faster alternative for hourly traffic data.
ts_data <- ts(df_ts$traffic_volume, frequency = 24 * 365.25, start = c(2016, 1))

2. Fit Holt-Winters Model

hw_model <- HoltWinters(ts_data)
plot(hw_model, main = "Holt-Winters Smoothed Traffic Volume")

3. Forecast & Extract Residuals

We’ll use the model’s residuals to detect anomalies.

# Forecast using fitted model (to match length of original data)
hw_fitted <- hw_model$fitted[,1]

# Compute residuals
residuals_hw <- ts_data - hw_fitted

# Visualize residuals
plot(residuals_hw, main = "Holt-Winters Residuals", col = "darkred")
abline(h = c(-2*sd(residuals_hw), 2*sd(residuals_hw)), col = "blue", lty = 2)
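
The plot above only draws the ±2 SD bands. A small follow-up sketch (not in the original draft) to actually extract the hours whose residuals fall outside those bands:

# Flag hours whose Holt-Winters residual exceeds 2 standard deviations (sketch)
res_sd <- sd(residuals_hw, na.rm = TRUE)
anomaly_idx <- which(abs(residuals_hw) > 2 * res_sd)

# Map back to timestamps: the fitted series starts one full seasonal period into
# the data, so offset the residual index by the difference in lengths.
offset <- length(ts_data) - length(hw_fitted)
anomaly_times <- df_ts$date_time[anomaly_idx + offset]
head(anomaly_times)
length(anomaly_times)  # number of anomalous hours flagged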

4. Detect Anomalies Using tsoutliers

# Find outliers (AO = additive outliers)
#outlier_result <- tso(ts_data, types = c("AO"))

# View summary
#summary(outlier_result)
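
The tso() call above is left commented out in this draft (it can be very slow on a series of this length). As a lighter-weight alternative sketch, tsoutliers() from the already-loaded forecast package (a different function from tsoutliers::tso()) flags candidate outliers by index; here it is applied to a daily-seasonality (frequency = 24) version of the series, a simplifying assumption rather than the annual period used earlier.

# Sketch: quick outlier scan with forecast::tsoutliers() on a daily-seasonality series
ts_daily <- ts(df_ts$traffic_volume, frequency = 24)
quick_out <- forecast::tsoutliers(ts_daily)
head(quick_out$index)                   # positions of flagged observations
head(df_ts$date_time[quick_out$index])  # corresponding timestamps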

5. Visualize Anomalies Overlaid on the Time Series

# Extract positions
#anomaly_indices <- outlier_result$outliers$time

# Overlay on time series
#autoplot(ts_data) +
  #geom_vline(xintercept = anomaly_indices, color = "red", linetype = "dashed") +
  #labs(title = "Traffic Volume with Detected Anomalies", y = "Traffic Volume") +
  #theme_minimal()