Primary objective
Disclaimer: Initial Draft
library(readr)
library(ggplot2)
library(patchwork)
library(dplyr)
library(lubridate)
library(GGally)
library(corrplot)
library(car)
library(ggthemes)
library(ggrepel)
library(boot)
library(broom)
library(lindia)
library(tidyverse)
library(zoo)
library(forecast)
library(tsoutliers)
week2 <- read_csv("C:/Users/rajas/Desktop/Desktop/Applied Data Science/INFOH510/R Jupyter/Metro_Interstate_Traffic_Volume.csv")
week2 <- week2[week2$temp > 0, ]       # drop physically impossible 0 K temperature readings
week2 <- week2[week2$rain_1h < 60, ]   # drop extreme rainfall outlier readings
week2 <- week2 |>
  mutate(temp = ((temp - 273) * 9 / 5) + 32)  # convert temperature from Kelvin to Fahrenheit
week2$hour <- as.integer(format(as.POSIXct(week2$date_time), "%H")) # extract hour, month, year, day, and weekday from date_time for later analysis
week2$month<- month(as.integer(format(as.POSIXct(week2$date_time),"%m")),label = TRUE) #using lubridate library to get the month labels
week2$year<- as.integer(format(as.POSIXct(week2$date_time),"%y"))
week2$day<- as.integer(format(as.POSIXct(week2$date_time),"%d"))
week2$weekday<-weekdays(as.Date(week2$date_time))
week2$weekday<-factor(week2$weekday,levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")) #sorting the weekdays
df <- week2

df %>%
  mutate(year = year(date_time)) %>%
count(year) %>%
ggplot(aes(x = factor(year), y = n)) +
geom_col(fill = "steelblue") +
labs(title = "Hourly Records Per Year", x = "Year", y = "Number of Records") +
theme_minimal()
# Create a complete sequence of hourly timestamps
complete_times <- data.frame(date_time = seq(min(df$date_time), max(df$date_time), by = "hour"))
# Join with the existing dataset to find missing timestamps
df_full <- full_join(complete_times, df, by = "date_time") %>%
  arrange(date_time)
# Check where traffic_volume is NA (indicates a missing record)
missing_times <- df_full %>% filter(is.na(traffic_volume))
# Summary of missing timestamps
missing_summary <- missing_times %>%
  mutate(year = year(date_time)) %>%
count(year)
print(missing_summary)
year n
1 2012 72
2 2013 1466
3 2014 4269
4 2015 5167
5 2016 947
6 2017 47
7 2018 19
TODO: add before/after visualizations of the date coverage (a sketch follows the filtering step below).
# 1. Keep only records from 2016 onward (year is the two-digit year column)
df_clean <- df[df$year >= 16, ]
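As a sketch of the before/after comparison flagged in the TODO above (assuming the goal is to contrast yearly record counts in the unfiltered df with the filtered df_clean), the two panels could be combined with patchwork, which is already loaded; yr, p_before, and p_after are throwaway names.
# Sketch: yearly record counts before vs. after restricting to 2016+
p_before <- df %>%
  mutate(yr = year(date_time)) %>%
  count(yr) %>%
  ggplot(aes(x = factor(yr), y = n)) +
  geom_col(fill = "grey60") +
  labs(title = "Before filtering", x = "Year", y = "Records") +
  theme_minimal()

p_after <- df_clean %>%
  mutate(yr = year(date_time)) %>%
  count(yr) %>%
  ggplot(aes(x = factor(yr), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "After filtering (2016+)", x = "Year", y = "Records") +
  theme_minimal()

p_before + p_after  # patchwork: side-by-side comparison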
# 2. Create full hourly sequence for 2016-2018
full_seq <- tibble(date_time = seq(from = min(df_clean$date_time),
                                   to   = max(df_clean$date_time),
                                   by   = "hour"))
# 3. Join and detect gaps
df_full <- full_seq %>%
  left_join(df_clean, by = "date_time") %>%
arrange(date_time)
# 4. Fill missing values via linear interpolation (an LOCF alternative is sketched after the check below)
df_interp <- df_full %>%
  mutate(across(c(temp, rain_1h, snow_1h, clouds_all, traffic_volume),
                ~ na.approx(., x = date_time, na.rm = FALSE))) %>%
fill(holiday, weather_main, weather_description, .direction = "downup")
# Optional: Confirm no more holes
sum(is.na(df_interp$traffic_volume)) # Should be 0
[1] 0
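The comment above mentions LOCF as an alternative to linear interpolation; a minimal sketch using zoo::na.locf (df_locf is a hypothetical name) would carry the last observed value forward instead of interpolating:
# Sketch: last-observation-carried-forward fill as an alternative to na.approx
df_locf <- df_full %>%
  mutate(across(c(temp, rain_1h, snow_1h, clouds_all, traffic_volume),
                ~ na.locf(., na.rm = FALSE))) %>%
  fill(holiday, weather_main, weather_description, .direction = "downup")
sum(is.na(df_locf$traffic_volume))  # leading NAs (if any) remain because na.rm = FALSE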
# Flag the rows that were originally missing (taken from df_full, i.e. before interpolation)
df_interp <- df_interp %>%
  mutate(missing = is.na(df_full$traffic_volume))
ggplot(df_interp, aes(x = date_time, y = traffic_volume)) +
geom_line(color = "steelblue") +
labs(title = "Traffic Volume Over Time (2016–2018)") +
theme_minimal()
# Use the gap-filled data for the analyses below
df_clean <- df_interp
# Create a complete sequence of hourly timestamps
complete_times <- data.frame(date_time = seq(min(df_interp$date_time), max(df_interp$date_time), by = "hour"))
# Join with the existing dataset to find missing timestamps
df_full <- full_join(complete_times, df_interp, by = "date_time") %>%
  arrange(date_time)
# Check where traffic_volume is NA (indicates a missing record)
missing_times <- df_full %>% filter(is.na(traffic_volume))
# Summary of missing timestamps
missing_summary <- missing_times %>%
  mutate(year = year(date_time)) %>%
count(year)
print(missing_summary)
[1] year n
<0 rows> (or 0-length row.names)
# Summary statistics of traffic volume
summary(df_clean$traffic_volume)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 1241 3442 3273 4913 7280
# Grouped summaries by weekday, hour
df_clean %>%
  mutate(weekday = wday(date_time, label = TRUE),
hour = hour(date_time)) %>%
group_by(weekday) %>%
summarise(mean_volume = mean(traffic_volume), .groups = "drop") %>%
ggplot(aes(x = weekday, y = mean_volume)) +
geom_col(fill = "tomato") +
labs(title = "Average Traffic Volume by Weekday")
Q: Is there a significant difference in traffic volume between weekdays and weekends?
df_clean <- df_clean %>%
  mutate(weekend = ifelse(wday(date_time) %in% c(1, 7), "Weekend", "Weekday"))
# t-test
t_test_result <- t.test(traffic_volume ~ weekend, data = df_clean)
print(t_test_result)
Welch Two Sample t-test
data: traffic_volume by weekend
t = 41.519, df = 19654, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Weekday and group Weekend is not equal to 0
95 percent confidence interval:
882.4137 969.8589
sample estimates:
mean in group Weekday mean in group Weekend
3535.217 2609.081
Q: Does rainfall or snowfall affect traffic volume?
# Categorize rainfall
df_clean <- df_clean %>%
  mutate(rain_group = ifelse(rain_1h > 0, "Rain", "No Rain"),
snow_group = ifelse(snow_1h > 0, "Snow", "No Snow"))
# ANOVA for rain effect
anova_rain <- aov(traffic_volume ~ rain_group, data = df_clean)
summary(anova_rain)
Df Sum Sq Mean Sq F value Pr(>F)
rain_group 1 1.265e+07 12649696 3.271 0.0705 .
Residuals 28870 1.117e+11 3867793
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA for snow effect
anova_snow <- aov(traffic_volume ~ snow_group, data = df_clean)
summary(anova_snow)
Df Sum Sq Mean Sq F value Pr(>F)
snow_group 1 1.228e+06 1228079 0.317 0.573
Residuals 28870 1.117e+11 3868188
Correlation Matrix
df_corr <- df_clean %>%
  select(traffic_volume, temp, rain_1h, snow_1h, clouds_all)

cor_matrix <- cor(df_corr, use = "complete.obs")
print(cor_matrix)
traffic_volume temp rain_1h snow_1h clouds_all
traffic_volume 1.000000000 0.11936974 -0.019437214 -0.003082033 0.09874833
temp 0.119369741 1.00000000 0.093895807 -0.020219846 -0.06326886
rain_1h -0.019437214 0.09389581 1.000000000 -0.003490101 0.07535976
snow_1h -0.003082033 -0.02021985 -0.003490101 1.000000000 0.02989620
clouds_all 0.098748334 -0.06326886 0.075359755 0.029896204 1.00000000
Linear Regression
model <- lm(traffic_volume ~ temp + rain_1h + snow_1h + clouds_all, data = df_clean)
summary(model)
Call:
lm(formula = traffic_volume ~ temp + rain_1h + snow_1h + clouds_all,
data = df_clean)
Residuals:
Min 1Q Median 3Q Max
-3807.8 -1888.9 168.4 1599.9 5394.3
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2462.9500 31.4702 78.263 < 2e-16 ***
temp 11.5146 0.5178 22.237 < 2e-16 ***
rain_1h -120.4670 17.6398 -6.829 8.7e-12 ***
snow_1h -1726.2851 2581.5666 -0.669 0.504
clouds_all 5.4926 0.2913 18.857 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1940 on 28867 degrees of freedom
Multiple R-squared: 0.02718, Adjusted R-squared: 0.02705
F-statistic: 201.6 on 4 and 28867 DF, p-value: < 2.2e-16
Fit traffic volume distribution & check normality
# Histogram & density
ggplot(df_clean, aes(x = traffic_volume)) +
  geom_histogram(aes(y = after_stat(density)), bins = 50, fill = "skyblue", alpha = 0.6) +
  geom_density(color = "red") +
  labs(title = "Traffic Volume Distribution")
# Shapiro-Wilk normality test (for a sample due to large data)
shapiro.test(sample(df_clean$traffic_volume, 5000))
Shapiro-Wilk normality test
data: sample(df_clean$traffic_volume, 5000)
W = 0.93343, p-value < 2.2e-16
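A normal Q-Q plot on a sample of the same size gives a visual complement to the Shapiro-Wilk result; a minimal sketch (the seed and tv_sample name are arbitrary choices, not from the draft):
# Sketch: normal Q-Q plot of a 5,000-row sample of traffic volume
set.seed(123)  # arbitrary seed for reproducibility
tv_sample <- sample(df_clean$traffic_volume, 5000)
qqnorm(tv_sample, main = "Normal Q-Q Plot: Traffic Volume (sample)")
qqline(tv_sample, col = "red", lwd = 2)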
df_clean <- df_clean %>%
  mutate(hour = hour(date_time),
is_holiday = ifelse(holiday != "None", "Holiday", "Non-Holiday"))
# Holiday effect
ggplot(df_clean, aes(x = is_holiday, y = traffic_volume)) +
geom_boxplot(fill = "lightgreen") +
labs(title = "Traffic Volume on Holidays vs Non-Holidays")
# Hourly trend
df_clean %>%
  group_by(hour) %>%
summarise(avg_volume = mean(traffic_volume)) %>%
ggplot(aes(x = hour, y = avg_volume)) +
geom_line(color = "darkblue", linewidth = 1) +
labs(title = "Hourly Average Traffic Volume")
Weekdays carry substantially higher traffic than weekends (Welch t-test: about 3,535 vs. 2,609 vehicles per hour, p < 2.2e-16). Weather effects are small: the ANOVA shows at most a marginal rain effect and no detectable snow effect, and although rainfall is a significant negative predictor in the regression, the weather-only model explains under 3% of the variance. Temperature and cloud cover show mild positive associations. Strong daily and weekly patterns dominate, and holidays appear to reduce volume. A heatmap sketch of the hour-by-weekday pattern follows.
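One way to make those daily and weekly patterns concrete is an hour-by-weekday heatmap of mean traffic volume; this is a sketch, and wk and hr are throwaway names:
# Sketch: mean traffic volume by hour of day and day of week
df_clean %>%
  mutate(wk = wday(date_time, label = TRUE),
         hr = hour(date_time)) %>%
  group_by(wk, hr) %>%
  summarise(mean_volume = mean(traffic_volume), .groups = "drop") %>%
  ggplot(aes(x = hr, y = wk, fill = mean_volume)) +
  geom_tile() +
  scale_fill_viridis_c(name = "Mean volume") +
  labs(title = "Mean Traffic Volume by Hour and Weekday", x = "Hour of day", y = NULL) +
  theme_minimal()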
# Use the 75th percentile of traffic volume as the cutoff for "high traffic"
threshold <- quantile(df_clean$traffic_volume, 0.75)
# Create binary response variable
df_clean <- df_clean %>%
  mutate(high_traffic = ifelse(traffic_volume >= threshold, 1, 0),
hour = hour(date_time),
weekday = wday(date_time, label = TRUE),
is_holiday = ifelse(holiday != "None", "Holiday", "Non-Holiday"))
We’ll include the following predictors: hour, weekday, temp, rain_1h, snow_1h, clouds_all, and is_holiday.
# Convert categorical variables
df_model <- df_clean %>%
  mutate(weekday = as.factor(weekday),
is_holiday = as.factor(is_holiday))
# Fit logistic model
logit_model <- glm(high_traffic ~ hour + weekday + temp + rain_1h + snow_1h + clouds_all + is_holiday,
                   data = df_model, family = binomial)
summary(logit_model)
Call:
glm(formula = high_traffic ~ hour + weekday + temp + rain_1h +
snow_1h + clouds_all + is_holiday, family = binomial, data = df_model)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.449e+01 8.902e+01 -0.163 0.8707
hour 8.575e-03 2.113e-03 4.057 4.97e-05 ***
weekday.L 1.349e+00 9.459e-02 14.258 < 2e-16 ***
weekday.Q -3.230e+00 9.137e-02 -35.352 < 2e-16 ***
weekday.C 3.370e-01 7.196e-02 4.683 2.83e-06 ***
weekday^4 -1.263e+00 5.052e-02 -25.006 < 2e-16 ***
weekday^5 5.167e-02 3.751e-02 1.377 0.1684
weekday^6 -1.657e-01 3.341e-02 -4.959 7.10e-07 ***
temp 6.143e-03 6.742e-04 9.111 < 2e-16 ***
rain_1h -7.974e-02 2.527e-02 -3.156 0.0016 **
snow_1h 8.304e-01 2.707e+00 0.307 0.7590
clouds_all 1.609e-03 3.728e-04 4.316 1.59e-05 ***
is_holidayNon-Holiday 1.250e+01 8.902e+01 0.140 0.8884
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 32485 on 28871 degrees of freedom
Residual deviance: 28394 on 28859 degrees of freedom
AIC: 28420
Number of Fisher Scoring iterations: 12
library(pscl)
pR2(logit_model) # McFadden's R²
fitting null model for pseudo-r2
llh llhNull G2 McFadden r2ML
-1.419705e+04 -1.624233e+04 4.090566e+03 1.259230e-01 1.321005e-01
r2CU
1.955918e-01
# Predict probabilities and classify
df_model$predicted <- ifelse(predict(logit_model, type = "response") > 0.5, 1, 0)
# Confusion matrix
table(Predicted = df_model$predicted, Actual = df_model$high_traffic)
Actual
Predicted 0 1
0 21648 7224
library(pROC)
roc_obj <- roc(df_model$high_traffic, predict(logit_model, type = "response"))
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_obj, col = "blue", main = "ROC Curve")
auc(roc_obj)
Area under the curve: 0.7148
# Plot residuals
plot(logit_model$residuals, main = "Residuals of Logistic Model", pch = 20)
abline(h = 0, col = "red")
# Leverage plot
plot(hatvalues(logit_model), main = "Leverage", pch = 20)
abline(h = 2*mean(hatvalues(logit_model)), col = "red", lty = 2)
Decompose and smooth the traffic volume time series.
Detect points where observed values significantly deviate from the expected trend/seasonality.
Visualize and interpret anomalies year by year.
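For the decomposition step, one option is an STL fit on a daily seasonal period; this captures only the within-day cycle (the weekly cycle would need a multi-seasonal approach such as forecast::mstl), so treat it as a sketch with ts_daily and stl_fit as hypothetical names:
# Sketch: STL decomposition of the gap-filled hourly series using a 24-hour seasonal period
ts_daily <- ts(df_interp$traffic_volume, frequency = 24)
stl_fit  <- stl(ts_daily, s.window = "periodic")
plot(stl_fit, main = "STL Decomposition of Hourly Traffic Volume (daily seasonality)")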
# Optional: aggregate hourly if necessary
df_ts <- df_interp %>%
  select(date_time, traffic_volume) %>%
arrange(date_time)
# Create time series object (hourly, start in 2016)
ts_data <- ts(df_ts$traffic_volume, frequency = 24 * 365.25, start = c(2016, 1))
hw_model <- HoltWinters(ts_data)
plot(hw_model, main = "Holt-Winters Smoothed Traffic Volume")
We’ll use the model’s residuals to detect anomalies; a sketch that flags and tabulates them follows the residual plot below.
# Extract the in-sample fitted values (they cover only the part of the series after initialization)
hw_fitted <- hw_model$fitted[, 1]
# Compute residuals
residuals_hw <- ts_data - hw_fitted
# Visualize residuals
plot(residuals_hw, main = "Holt-Winters Residuals", col = "darkred")
abline(h = c(-2*sd(residuals_hw), 2*sd(residuals_hw)), col = "blue", lty = 2)
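To turn the residual band into actual flags, a minimal sketch that tabulates anomalous hours by approximate calendar year; the ±3 SD cutoff and the names res_vec, res_time, cutoff, and anomaly_pos are assumptions, not from the draft:
# Sketch: flag residuals beyond +/- 3 SD and count flagged hours per year
res_vec  <- as.numeric(residuals_hw)
res_time <- time(residuals_hw)              # fractional years, since the series starts at c(2016, 1)
cutoff   <- 3 * sd(res_vec, na.rm = TRUE)   # 3-SD cutoff is an assumed threshold
anomaly_pos <- which(abs(res_vec) > cutoff)
table(floor(res_time[anomaly_pos]))         # approximate anomaly counts by year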
# Find outliers (AO = additive outliers)
#outlier_result <- tso(ts_data, types = c("AO"))
# View summary
#summary(outlier_result)
# Extract positions
#anomaly_indices <- outlier_result$outliers$time
# Overlay on time series
#autoplot(ts_data) +
#geom_vline(xintercept = anomaly_indices, color = "red", linetype = "dashed") +
#labs(title = "Traffic Volume with Detected Anomalies", y = "Traffic Volume") +
#theme_minimal()