Reading the data and performing minor adjustments to remove inappropriate outliers and make the data easy to work with.
library(readr)
library(ggplot2)
library(patchwork)
library(dplyr)
library(lubridate)
library(GGally)
library(corrplot)
week2=read_csv("C:/Users/rajas/OneDrive/Desktop/Desktop/Applied Data Science/INFOH510/R Jupyter/Metro_Interstate_Traffic_Volume.csv")
week2=week2[week2$temp>0,]
week2=week2[week2$rain_1h< 60,]
week2<- week2|>
mutate(temp=(((temp-273)*9/5))+32)
week2$hour<- as.integer(format(as.POSIXct(week2$date_time),"%H")) #converting the date_time information into hours,month,year, weekdays to get relevant insights.
week2$month<- month(as.integer(format(as.POSIXct(week2$date_time),"%m")),label = TRUE) #using lubridate library to get the month labels
week2$year<- as.integer(format(as.POSIXct(week2$date_time),"%y"))
week2$day<- as.integer(format(as.POSIXct(week2$date_time),"%d"))
week2$weekday<-weekdays(as.Date(week2$date_time))
week2$weekday<-factor(week2$weekday,levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")) #sorting the weekdays
data_df<-week2
Selecting 5 samples of data from the dataset. These samples are about 50% of data from the dataset with replacement. For simplicity, we are saving the different samples into dataframes.
sample_size <- floor(nrow(data_df) * 0.5)
# Set seed for reproducibility
set.seed(123)
# Create 5 bootstrap samples (with replacement)
df1 <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]
df2 <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]
df3 <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]
df4 <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]
df5 <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]
In order to understand the similarities and dissimilarities within these samples, we start with comparing summary statistics to see if the they vary significantly.
# Function to get summary stats
get_summary <- function(df) {
summary(df$traffic_volume)
}
# Get summary statistics for each sample
summary_df1 <- get_summary(df1)
summary_df2 <- get_summary(df2)
summary_df3 <- get_summary(df3)
summary_df4 <- get_summary(df4)
summary_df5 <- get_summary(df5)
# Print summary statistics
summary_df1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1209 3389 3260 4923 7217
summary_df2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1165 3353 3244 4913 7241
summary_df3
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1208 3405 3275 4955 7260
summary_df4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1187 3370 3255 4936 7260
summary_df5
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1205 3429 3272 4930 7260
# Combine data for visualization
df_combined <- data.frame(
traffic_volume = c(df1$traffic_volume, df2$traffic_volume, df3$traffic_volume, df4$traffic_volume, df5$traffic_volume),
sample = rep(c("df1", "df2", "df3", "df4", "df5"), each = sample_size)
)
# Boxplot comparison
plt1<-df_combined|>
ggplot()+
geom_boxplot(aes(x = sample, y = traffic_volume, fill = sample),alpha = 0.7) +
labs(title = "Traffic Volume Summary Statistics Across Samples", x = "Sample", y = "Traffic Volume") +
theme(axis.text=element_text(size=25),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20),
plot.title = element_text(size = 20),
legend.key.size = unit(2,"cm"),
legend.text = element_text(size = 18),
legend.title = element_text(size = 14),
panel.background = element_rect(fill = 'white'),
panel.grid.major = element_line(color = "grey"))
plt2<-df_combined|>
ggplot() +
geom_violin(aes(x = sample, y = traffic_volume, fill = sample),alpha = 0.7) +
labs(title = "Traffic Volume Distribution Across Samples", x = "Sample", y = "Traffic Volume") +
theme(axis.text=element_text(size=25),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20),
plot.title = element_text(size = 20),
legend.key.size = unit(2,"cm"),
legend.text = element_text(size = 18),
legend.title = element_text(size = 14),
panel.background = element_rect(fill = 'white'),
panel.grid.major = element_line(color = "grey"))
#plt1+plt2+plot_layout(guides="collect")
plt1
plt2
From the above Box plots and violin plots for each samples we can conclude that the traffic volume is not very sensitive to sampling. There is very little change in distribution. This is also depicted by the density plot shown below.
df_combined <- data.frame(
traffic_volume = c(df1$traffic_volume, df2$traffic_volume, df3$traffic_volume, df4$traffic_volume, df5$traffic_volume),
sample = rep(c("df1", "df2", "df3", "df4", "df5"), each = sample_size)
)
# Density plot comparison
ggplot(df_combined, aes(x = traffic_volume, fill = sample)) +
geom_density(alpha = 0.4) +
labs(title = "Traffic Volume Distribution Across Samples", x = "Traffic Volume", fill = "Sample")+
theme(axis.text=element_text(size=20),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20),
plot.title = element_text(size = 20),
legend.key.size = unit(2,"cm"),
legend.text = element_text(size = 18),
legend.title = element_text(size = 14),
panel.background = element_rect(fill = 'white'),
panel.grid.major = element_line(color = "grey"))
To detect if any anomalous observations we will check if any of the data points fall over some basic limits based on the inter quartile range. We will take 1.5*IQR (Commonly followed rule as it closely follows the Gaussian distribution) as the upper and lower limits to check if there are any anomalies.
# Function to detect anomalies using IQR method
detect_anomalies <- function(df) {
Q1 <- quantile(df$traffic_volume, 0.25)
Q3 <- quantile(df$traffic_volume, 0.75)
IQR_value <- Q3 - Q1
anomalies <- df %>% filter(traffic_volume < (Q1 - 0.75 * IQR_value) |
traffic_volume > (Q3 + 0.75 * IQR_value))
return(nrow(anomalies)) # Number of anomalies
}
# Get anomaly counts
anomalies_df1 <- detect_anomalies(df1)
anomalies_df2 <- detect_anomalies(df2)
anomalies_df3 <- detect_anomalies(df3)
anomalies_df4 <- detect_anomalies(df4)
anomalies_df5 <- detect_anomalies(df5)
# Print anomaly counts
anomalies_df1
## [1] 0
anomalies_df2
## [1] 0
anomalies_df3
## [1] 0
anomalies_df4
## [1] 0
anomalies_df5
## [1] 0
As seen from the output, there seems to be no specific outliers for the samples we chose.
In order to check if the data distribution is consistent across the samples, we can do correlation analysis across different columns to see how they behave. Lets take traffic volume and temp for comparison. From previous assignments we know that temp is slightly positively correlated to traffic volume, which means that people drive slightly more on hotter days. Lets find the correlation and plot it across the samples to see if this holds true.
# Function to check correlation between traffic volume and temperature
get_correlation <- function(df) {
cor(df$traffic_volume, df$temp, use = "complete.obs")
}
# Compute correlations
cor_df1 <- get_correlation(df1)
cor_df2 <- get_correlation(df2)
cor_df3 <- get_correlation(df3)
cor_df4 <- get_correlation(df4)
cor_df5 <- get_correlation(df5)
# Print correlation values
cor_df1
## [1] 0.130397
cor_df2
## [1] 0.1371391
cor_df3
## [1] 0.1279851
cor_df4
## [1] 0.1317125
cor_df5
## [1] 0.133282
# Compute correlation with temperature for each sample
cor_values <- data.frame(
Sample = c("df1", "df2", "df3", "df4", "df5"),
Correlation = c(
cor(df1$traffic_volume, df1$temp, use = "complete.obs"),
cor(df2$traffic_volume, df2$temp, use = "complete.obs"),
cor(df3$traffic_volume, df3$temp, use = "complete.obs"),
cor(df4$traffic_volume, df4$temp, use = "complete.obs"),
cor(df5$traffic_volume, df5$temp, use = "complete.obs")
)
)
# Plot correlation values
ggplot(cor_values, aes(x = Sample, y = Correlation, fill = Sample)) +
geom_bar(stat = "identity", alpha = 0.7) +
labs(title = "Traffic Volume vs Temperature Correlation Across Samples", y = "Correlation") +
theme(axis.text=element_text(size=20),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20),
plot.title = element_text(size = 20),
legend.key.size = unit(2,"cm"),
legend.text = element_text(size = 18),
legend.title = element_text(size = 14),
panel.background = element_rect(fill = 'white'),
panel.grid.major = element_line(color = "grey"))
As we see, above the slight positive correlation holds true across the samples. There are minor difference in the scale of correlation but mostly across the samples its similar.
This exercise primarily indicates that in future, if we want to sample the data to extrapolate insights or to check validity across the data set, we can confidently pick a random sample and expect reasonably accurate results.
Monte Carlo simulations can help assess the variability of key metrics (like mean traffic volume or correlations) across the subsamples. The idea is to repeatedly sample from the dataset and observe the distribution of computed statistics.
This simulation for Traffic Volume Mean estimates the variability of mean traffic volume across the subsamples.
# Function to perform Monte Carlo simulations
monte_carlo_mean <- function(data_df, n_simulations = 1000, sample_size = floor(nrow(data_df) * 0.5)) {
means <- numeric(n_simulations) # Store results
for (i in 1:n_simulations) {
sample_data <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ] # Resample
means[i] <- mean(sample_data$traffic_volume, na.rm = TRUE) # Compute mean
}
return(means)
}
# Run Monte Carlo simulations for each sample
set.seed(123)
mc_means_df1 <- monte_carlo_mean(df1)
mc_means_df2 <- monte_carlo_mean(df2)
mc_means_df3 <- monte_carlo_mean(df3)
mc_means_df4 <- monte_carlo_mean(df4)
mc_means_df5 <- monte_carlo_mean(df5)
# Combine results into a dataframe for visualization
mc_means_combined <- data.frame(
Mean_Traffic_Volume = c(mc_means_df1, mc_means_df2, mc_means_df3, mc_means_df4, mc_means_df5),
Sample = rep(c("df1", "df2", "df3", "df4", "df5"), each = 1000)
)
# Density plot of Monte Carlo means
ggplot(mc_means_combined, aes(x = Mean_Traffic_Volume, fill = Sample)) +
geom_density(alpha = 0.4) +
labs(title = "Monte Carlo Simulation: Traffic Volume Means Across Samples",
x = "Mean Traffic Volume", y = "Density") +
theme(axis.text=element_text(size=20),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20),
plot.title = element_text(size = 20),
legend.key.size = unit(2,"cm"),
legend.text = element_text(size = 18),
legend.title = element_text(size = 14),
panel.background = element_rect(fill = 'white'),
panel.grid.major = element_line(color = "grey"))
Looking at the above graph, the distributions of mean traffic volume for the five samples overlap significantly suggesting mean traffic volume is fairly stable across different resamples. df3 has a slightly higher peak mean compared to others. This indicates small variations due to random sampling but not major anomalies. The width of the density curves shows variability in mean traffic volume. Since all samples have a similar spread, it indicates minimal fluctuations in traffic volume across the samples.
Now, lets see how the correlation between traffic volume and temperature fluctuates across repeated resampling.
# Function to perform Monte Carlo simulations for correlation
monte_carlo_correlation <- function(data_df, n_simulations = 1000, sample_size = floor(nrow(data_df) * 0.5)) {
correlations <- numeric(n_simulations)
for (i in 1:n_simulations) {
sample_data <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]
correlations[i] <- cor(sample_data$traffic_volume, sample_data$temp, use = "complete.obs")
}
return(correlations)
}
# Run Monte Carlo simulations for each sample
set.seed(123)
mc_cor_df1 <- monte_carlo_correlation(df1)
mc_cor_df2 <- monte_carlo_correlation(df2)
mc_cor_df3 <- monte_carlo_correlation(df3)
mc_cor_df4 <- monte_carlo_correlation(df4)
mc_cor_df5 <- monte_carlo_correlation(df5)
# Combine results into a dataframe
mc_cor_combined <- data.frame(
Correlation = c(mc_cor_df1, mc_cor_df2, mc_cor_df3, mc_cor_df4, mc_cor_df5),
Sample = rep(c("df1", "df2", "df3", "df4", "df5"), each = 1000)
)
# Density plot of Monte Carlo correlations
ggplot(mc_cor_combined, aes(x = Correlation, fill = Sample)) +
geom_density(alpha = 0.4) +
labs(title = "Monte Carlo Simulation: Correlation Between Traffic Volume & Temperature",
x = "Correlation", y = "Density") +
theme(axis.text=element_text(size=20),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20),
plot.title = element_text(size = 20),
legend.key.size = unit(2,"cm"),
legend.text = element_text(size = 18),
legend.title = element_text(size = 14),
panel.background = element_rect(fill = 'white'),
panel.grid.major = element_line(color = "grey"))
Since the correlation values are consistent across resamples, the relationship between temperature and traffic volume is stable. We confirmed this in the 3.2 Correlation analysis as well.