Data dive Week 6: Confidence Intervals

1. Reading Data

Reading the data and performing minor adjustments to remove inappropriate outliers and make the data easy to work with.

library(readr)
library(ggplot2)
library(patchwork)
library(dplyr)
library(lubridate)
library(GGally)
library(corrplot)
week2=read_csv("C:/Users/rajas/OneDrive/Desktop/Desktop/Applied Data Science/INFOH510/R Jupyter/Metro_Interstate_Traffic_Volume.csv")
week2=week2[week2$temp>0,]
week2=week2[week2$rain_1h< 60,]
week2<- week2|>
  mutate(temp=(((temp-273)*9/5))+32)
week2$hour<- as.integer(format(as.POSIXct(week2$date_time),"%H")) #converting the date_time information into hours,month,year, weekdays to get relevant insights.
week2$month<- month(as.integer(format(as.POSIXct(week2$date_time),"%m")),label = TRUE) #using lubridate library to get the month labels
week2$year<- as.integer(format(as.POSIXct(week2$date_time),"%y"))
week2$day<- as.integer(format(as.POSIXct(week2$date_time),"%d"))
week2$weekday<-weekdays(as.Date(week2$date_time))
week2$weekday<-factor(week2$weekday,levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")) #sorting the weekdays
data_df<-week2

2. Variable Selection

In order to do relationship analysis, we need to create a couple explanatory variables. The following are the same.

Traffic Volume vs. Peak Hour Indicator
- Response Variable : traffic_volume
- Explanatory Variable : peak_hour (Binary variable: 1 if it’s rush hour, 0 otherwise)
Traffic Volume vs. Temperature Difference from Mean
- Response Variable : traffic_volume
- Explanatory Variable : temp_deviation (Difference between the current temperature and the dataset’s mean temperature)

data_df<- data_df|>
  mutate(is_peak_hour = ifelse(hour %in% c(7, 8, 9, 16, 17, 18, 19), 1, 0))

# Calculate mean temperature
mean_temp <- mean(data_df$temp, na.rm = TRUE)

# Create 'temp_deviation' (Difference from mean temperature)
data_df<- data_df|>
  mutate(temp_deviation = temp - mean_temp)

3. Visualizing Relationships

3.1 Traffic Volume vs. Peak Hour Indicator

ggplot(data_df, aes(x = as.factor(is_peak_hour), y = traffic_volume, fill = as.factor(is_peak_hour))) +
  geom_boxplot() +
  labs(title = "Traffic Volume During Peak vs. Non-Peak Hours",
       x = "Peak Hour (1 = Yes, 0 = No)",
       y = "Traffic Volume") +
    theme(axis.text=element_text(size=25),
          axis.title.x = element_text(size = 20),
          axis.title.y = element_text(size = 20),
          plot.title = element_text(size = 20),
          legend.key.size = unit(2,"cm"),
          legend.text = element_text(size = 18),
          legend.title = element_text(size = 14),
          panel.background = element_rect(fill = 'white'),
          panel.grid.major = element_line(color = "grey"))

The box plot above shows the mean for peak hours is much higher than non-peak hours. This is in accordance with the usual assumption that peak hours generate most traffic.

3.2 Traffic Volume vs. Temperature Deviation

# Create temperature deviation categories
data_df <- data_df |>
  mutate(temp_dev_category = case_when(
    temp_deviation <= -5 ~ "Much Colder",
    temp_deviation > -5 & temp_deviation <= -2 ~ "Colder",
    temp_deviation > -2 & temp_deviation <= 2 ~ "Normal",
    temp_deviation > 2 & temp_deviation <= 5 ~ "Warmer",
    temp_deviation > 5 ~ "Much Warmer"
  ))

# Convert to ordered factor for proper plotting
data_df$temp_dev_category <- factor(data_df$temp_dev_category, levels = c("Much Colder", "Colder", "Normal", "Warmer", "Much Warmer"))

# Aggregate traffic volume by temperature deviation bins
data_summary <- data_df %>%
  group_by(temp_dev_category) %>%
  summarise(mean_traffic = mean(traffic_volume, na.rm = TRUE))

# Line plot
ggplot(data_summary, aes(x = temp_dev_category, y = mean_traffic, group = 1)) +
  geom_line(color = "steelblue", linewidth = 1.2) +
  geom_point(size = 3, color = "blue") +
  labs(title = "Mean Traffic Volume vs. Temperature Deviation",
       x = "Temperature Deviation Category",
       y = "Mean Traffic Volume") +
  theme(axis.text=element_text(size=25),
          axis.title.x = element_text(size = 20),
          axis.title.y = element_text(size = 20),
          plot.title = element_text(size = 20),
          legend.key.size = unit(2,"cm"),
          legend.text = element_text(size = 18),
          legend.title = element_text(size = 14),
          panel.background = element_rect(fill = 'white'),
          panel.grid.major = element_line(color = "grey"))

Classifying the temperature into few easily identifiable buckets with much colder being <-5 and Much warmer being greater than 5.

As expected much colder days have negative impact on traffic volume. This could be due to rain/snow and other adverse weather conditions impairing traffic. It is interesting to note that colder days has more traffic voluem than normal days. This could be due to various factors. Examining specific road conditions (rain/snow) during these temperatures will give us a better idea why this is the case.

4. Correlation analysis

Traffic Volume vs. Peak Hour (Point-Biserial Correlation)

cor_peak <- cor(data_df$traffic_volume, data_df$is_peak_hour, method = "pearson")
cat("Correlation between Traffic Volume and Peak Hour:", cor_peak, "\n")

## Correlation between Traffic Volume and Peak Hour: 0.4338524

We see high positive correlation indicating that there is large traffic volume during peak hours which is expected.

Traffic Volume vs. Temperature Deviation (Pearson Correlation)

cor_temp <- cor(data_df$traffic_volume, data_df$temp_deviation, method = "pearson")
cat("Correlation between Traffic Volume and Temperature Deviation:", cor_temp, "\n")

## Correlation between Traffic Volume and Temperature Deviation: 0.1322578

We see slight positive correlation with temperature and traffic volume. This indicates that traffic patterns are dependent on temperatures. But the correlation is not too strong to signify outright dependence on temperature.

5. Confidence Interval Analysis

4.1 Confidence Interval for Traffic Volume

n <- nrow(data_df)
mean_traffic <- mean(data_df$traffic_volume)
sd_traffic <- sd(data_df$traffic_volume)
error_margin <- qt(0.975, df = n-1) * (sd_traffic / sqrt(n))

conf_interval <- c(mean_traffic - error_margin, mean_traffic + error_margin)
cat("95% Confidence Interval for Traffic Volume:", conf_interval, "\n")

## 95% Confidence Interval for Traffic Volume: 3242.436 3277.912

Looking at the confidence interval itself, we can see that the range of possible mean. Plotting the confidence interval will give us a better idea about the true mean.

# Calculate 95% confidence interval for traffic volume
n <- nrow(data_df)
mean_traffic <- mean(data_df$traffic_volume, na.rm = TRUE)
sd_traffic <- sd(data_df$traffic_volume, na.rm = TRUE)
error_margin <- qt(0.975, df = n-1) * (sd_traffic / sqrt(n))
conf_low <- mean_traffic - error_margin
conf_high <- mean_traffic + error_margin

# Histogram with confidence interval lines
ggplot(data_df, aes(x = traffic_volume)) +
  geom_histogram(binwidth = 500, fill = "skyblue", color = "black", alpha = 0.7) +
  geom_vline(xintercept = mean_traffic, color = "red", size = 1) +
  geom_vline(xintercept = conf_low, color = "blue", linetype = "dotted", size = 1.2) +
  geom_vline(xintercept = conf_high, color = "blue", linetype = "dotted", size = 1.2) +
  labs(title = "Traffic Volume Distribution with 95% Confidence Interval",
       x = "Traffic Volume",
       y = "Frequency") +
  theme(axis.text=element_text(size=25),
          axis.title.x = element_text(size = 20),
          axis.title.y = element_text(size = 20),
          plot.title = element_text(size = 20),
          legend.key.size = unit(2,"cm"),
          legend.text = element_text(size = 18),
          legend.title = element_text(size = 14),
          panel.background = element_rect(fill = 'white'),
          panel.grid.major = element_line(color = "grey"))

The blue dotted line above indicates the confidence interval of 95%. The red line shows the mean traffic of the dataset. We can clearly see that the 95% confidence interval is a small range wherein the theoretical mean falls within it.