Data Dive 4: Sampling and Drawing Conclusions

1. Reading Data

Reading the data and performing minor adjustments to remove inappropriate outliers and make the data easy to work with.

library(readr)
library(ggplot2)
library(patchwork)
library(dplyr)
library(lubridate)
library(GGally)
library(corrplot)
week2=read_csv("C:/Users/rajas/OneDrive/Desktop/Desktop/Applied Data Science/INFOH510/R Jupyter/Metro_Interstate_Traffic_Volume.csv")
week2=week2[week2$temp>0,]
week2=week2[week2$rain_1h< 60,]
week2<- week2|>
  mutate(temp=(((temp-273)*9/5))+32)
week2$hour<- as.integer(format(as.POSIXct(week2$date_time),"%H")) #converting the date_time information into hours,month,year, weekdays to get relevant insights.
week2$month<- month(as.integer(format(as.POSIXct(week2$date_time),"%m")),label = TRUE) #using lubridate library to get the month labels
week2$year<- as.integer(format(as.POSIXct(week2$date_time),"%y"))
week2$day<- as.integer(format(as.POSIXct(week2$date_time),"%d"))
week2$weekday<-weekdays(as.Date(week2$date_time))
week2$weekday<-factor(week2$weekday,levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")) #sorting the weekdays
data_df<-week2

2. Sampling Data

Selecting 5 samples of data from the dataset. These samples are about 50% of data from the dataset with replacement. For simplicity, we are saving the different samples into dataframes.

sample_size <- floor(nrow(data_df) * 0.5)

# Set seed for reproducibility
set.seed(123)

# Create 5 bootstrap samples (with replacement)
df1 <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]
df2 <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]
df3 <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]
df4 <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]
df5 <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]

3. Analyzing the samples

In order to understand the similarities and dissimilarities within these samples, we start with comparing summary statistics to see if the they vary significantly.

# Function to get summary stats
get_summary <- function(df) {
  summary(df$traffic_volume)
}

# Get summary statistics for each sample
summary_df1 <- get_summary(df1)
summary_df2 <- get_summary(df2)
summary_df3 <- get_summary(df3)
summary_df4 <- get_summary(df4)
summary_df5 <- get_summary(df5)

# Print summary statistics
summary_df1

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    1209    3389    3260    4923    7217

summary_df2

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1165    3353    3244    4913    7241

summary_df3

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1208    3405    3275    4955    7260

summary_df4

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1187    3370    3255    4936    7260

summary_df5

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1205    3429    3272    4930    7260

# Combine data for visualization
df_combined <- data.frame(
  traffic_volume = c(df1$traffic_volume, df2$traffic_volume, df3$traffic_volume, df4$traffic_volume, df5$traffic_volume),
  sample = rep(c("df1", "df2", "df3", "df4", "df5"), each = sample_size)
)

# Boxplot comparison
plt1<-df_combined|>
    ggplot()+
    geom_boxplot(aes(x = sample, y = traffic_volume, fill = sample),alpha = 0.7) +
    labs(title = "Traffic Volume Summary Statistics Across Samples", x = "Sample", y = "Traffic Volume") +
    theme(axis.text=element_text(size=25),
          axis.title.x = element_text(size = 20),
          axis.title.y = element_text(size = 20),
          plot.title = element_text(size = 20),
          legend.key.size = unit(2,"cm"),
          legend.text = element_text(size = 18),
          legend.title = element_text(size = 14),
          panel.background = element_rect(fill = 'white'),
          panel.grid.major = element_line(color = "grey"))
plt2<-df_combined|>
    ggplot() +
    geom_violin(aes(x = sample, y = traffic_volume, fill = sample),alpha = 0.7) +
    labs(title = "Traffic Volume Distribution Across Samples", x = "Sample", y = "Traffic Volume") +
    theme(axis.text=element_text(size=25),
          axis.title.x = element_text(size = 20),
          axis.title.y = element_text(size = 20),
          plot.title = element_text(size = 20),
          legend.key.size = unit(2,"cm"),
          legend.text = element_text(size = 18),
          legend.title = element_text(size = 14),
          panel.background = element_rect(fill = 'white'),
          panel.grid.major = element_line(color = "grey"))

#plt1+plt2+plot_layout(guides="collect")
plt1

plt2

From the above Box plots and violin plots for each samples we can conclude that the traffic volume is not very sensitive to sampling. There is very little change in distribution. This is also depicted by the density plot shown below.

df_combined <- data.frame(
  traffic_volume = c(df1$traffic_volume, df2$traffic_volume, df3$traffic_volume, df4$traffic_volume, df5$traffic_volume),
  sample = rep(c("df1", "df2", "df3", "df4", "df5"), each = sample_size)
)

# Density plot comparison
ggplot(df_combined, aes(x = traffic_volume, fill = sample)) +
  geom_density(alpha = 0.4) +
  labs(title = "Traffic Volume Distribution Across Samples", x = "Traffic Volume", fill = "Sample")+
  theme(axis.text=element_text(size=20),
          axis.title.x = element_text(size = 20),
          axis.title.y = element_text(size = 20),
          plot.title = element_text(size = 20),
          legend.key.size = unit(2,"cm"),
          legend.text = element_text(size = 18),
          legend.title = element_text(size = 14),
          panel.background = element_rect(fill = 'white'),
          panel.grid.major = element_line(color = "grey"))

3.1 Anomaly detection

To detect if any anomalous observations we will check if any of the data points fall over some basic limits based on the inter quartile range. We will take 1.5*IQR (Commonly followed rule as it closely follows the Gaussian distribution) as the upper and lower limits to check if there are any anomalies.

# Function to detect anomalies using IQR method
detect_anomalies <- function(df) {
  Q1 <- quantile(df$traffic_volume, 0.25)
  Q3 <- quantile(df$traffic_volume, 0.75)
  IQR_value <- Q3 - Q1
  anomalies <- df %>% filter(traffic_volume < (Q1 - 0.75 * IQR_value) | 
                             traffic_volume > (Q3 + 0.75 * IQR_value))
  return(nrow(anomalies))  # Number of anomalies
}

# Get anomaly counts
anomalies_df1 <- detect_anomalies(df1)
anomalies_df2 <- detect_anomalies(df2)
anomalies_df3 <- detect_anomalies(df3)
anomalies_df4 <- detect_anomalies(df4)
anomalies_df5 <- detect_anomalies(df5)

# Print anomaly counts
anomalies_df1

## [1] 0

anomalies_df2

## [1] 0

anomalies_df3

## [1] 0

anomalies_df4

## [1] 0

anomalies_df5

## [1] 0

As seen from the output, there seems to be no specific outliers for the samples we chose.

3.2 Correlation analysis

In order to check if the data distribution is consistent across the samples, we can do correlation analysis across different columns to see how they behave. Lets take traffic volume and temp for comparison. From previous assignments we know that temp is slightly positively correlated to traffic volume, which means that people drive slightly more on hotter days. Lets find the correlation and plot it across the samples to see if this holds true.

# Function to check correlation between traffic volume and temperature
get_correlation <- function(df) {
  cor(df$traffic_volume, df$temp, use = "complete.obs")
}

# Compute correlations
cor_df1 <- get_correlation(df1)
cor_df2 <- get_correlation(df2)
cor_df3 <- get_correlation(df3)
cor_df4 <- get_correlation(df4)
cor_df5 <- get_correlation(df5)

# Print correlation values
cor_df1

## [1] 0.130397

cor_df2

## [1] 0.1371391

cor_df3

## [1] 0.1279851

cor_df4

## [1] 0.1317125

cor_df5

## [1] 0.133282

# Compute correlation with temperature for each sample
cor_values <- data.frame(
  Sample = c("df1", "df2", "df3", "df4", "df5"),
  Correlation = c(
    cor(df1$traffic_volume, df1$temp, use = "complete.obs"),
    cor(df2$traffic_volume, df2$temp, use = "complete.obs"),
    cor(df3$traffic_volume, df3$temp, use = "complete.obs"),
    cor(df4$traffic_volume, df4$temp, use = "complete.obs"),
    cor(df5$traffic_volume, df5$temp, use = "complete.obs")
  )
)

# Plot correlation values
ggplot(cor_values, aes(x = Sample, y = Correlation, fill = Sample)) +
  geom_bar(stat = "identity", alpha = 0.7) +
  labs(title = "Traffic Volume vs Temperature Correlation Across Samples", y = "Correlation") +
  theme(axis.text=element_text(size=20),
          axis.title.x = element_text(size = 20),
          axis.title.y = element_text(size = 20),
          plot.title = element_text(size = 20),
          legend.key.size = unit(2,"cm"),
          legend.text = element_text(size = 18),
          legend.title = element_text(size = 14),
          panel.background = element_rect(fill = 'white'),
          panel.grid.major = element_line(color = "grey"))

As we see, above the slight positive correlation holds true across the samples. There are minor difference in the scale of correlation but mostly across the samples its similar.

This exercise primarily indicates that in future, if we want to sample the data to extrapolate insights or to check validity across the data set, we can confidently pick a random sample and expect reasonably accurate results.

4. Additional Analysis : Monte Carlo Simulations

Monte Carlo simulations can help assess the variability of key metrics (like mean traffic volume or correlations) across the subsamples. The idea is to repeatedly sample from the dataset and observe the distribution of computed statistics.

This simulation for Traffic Volume Mean estimates the variability of mean traffic volume across the subsamples.

# Function to perform Monte Carlo simulations
monte_carlo_mean <- function(data_df, n_simulations = 1000, sample_size = floor(nrow(data_df) * 0.5)) {
  means <- numeric(n_simulations)  # Store results

  for (i in 1:n_simulations) {
    sample_data <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]  # Resample
    means[i] <- mean(sample_data$traffic_volume, na.rm = TRUE)  # Compute mean
  }

  return(means)
}

# Run Monte Carlo simulations for each sample
set.seed(123)
mc_means_df1 <- monte_carlo_mean(df1)
mc_means_df2 <- monte_carlo_mean(df2)
mc_means_df3 <- monte_carlo_mean(df3)
mc_means_df4 <- monte_carlo_mean(df4)
mc_means_df5 <- monte_carlo_mean(df5)

# Combine results into a dataframe for visualization
mc_means_combined <- data.frame(
  Mean_Traffic_Volume = c(mc_means_df1, mc_means_df2, mc_means_df3, mc_means_df4, mc_means_df5),
  Sample = rep(c("df1", "df2", "df3", "df4", "df5"), each = 1000)
)

# Density plot of Monte Carlo means
ggplot(mc_means_combined, aes(x = Mean_Traffic_Volume, fill = Sample)) +
  geom_density(alpha = 0.4) +
  labs(title = "Monte Carlo Simulation: Traffic Volume Means Across Samples",
       x = "Mean Traffic Volume", y = "Density") +
  theme(axis.text=element_text(size=20),
          axis.title.x = element_text(size = 20),
          axis.title.y = element_text(size = 20),
          plot.title = element_text(size = 20),
          legend.key.size = unit(2,"cm"),
          legend.text = element_text(size = 18),
          legend.title = element_text(size = 14),
          panel.background = element_rect(fill = 'white'),
          panel.grid.major = element_line(color = "grey"))

Looking at the above graph, the distributions of mean traffic volume for the five samples overlap significantly suggesting mean traffic volume is fairly stable across different resamples. df3 has a slightly higher peak mean compared to others. This indicates small variations due to random sampling but not major anomalies. The width of the density curves shows variability in mean traffic volume. Since all samples have a similar spread, it indicates minimal fluctuations in traffic volume across the samples.

Now, lets see how the correlation between traffic volume and temperature fluctuates across repeated resampling.

# Function to perform Monte Carlo simulations for correlation
monte_carlo_correlation <- function(data_df, n_simulations = 1000, sample_size = floor(nrow(data_df) * 0.5)) {
  correlations <- numeric(n_simulations)  

  for (i in 1:n_simulations) {
    sample_data <- data_df[sample(nrow(data_df), sample_size, replace = TRUE), ]
    correlations[i] <- cor(sample_data$traffic_volume, sample_data$temp, use = "complete.obs")
  }

  return(correlations)
}

# Run Monte Carlo simulations for each sample
set.seed(123)
mc_cor_df1 <- monte_carlo_correlation(df1)
mc_cor_df2 <- monte_carlo_correlation(df2)
mc_cor_df3 <- monte_carlo_correlation(df3)
mc_cor_df4 <- monte_carlo_correlation(df4)
mc_cor_df5 <- monte_carlo_correlation(df5)

# Combine results into a dataframe
mc_cor_combined <- data.frame(
  Correlation = c(mc_cor_df1, mc_cor_df2, mc_cor_df3, mc_cor_df4, mc_cor_df5),
  Sample = rep(c("df1", "df2", "df3", "df4", "df5"), each = 1000)
)

# Density plot of Monte Carlo correlations
ggplot(mc_cor_combined, aes(x = Correlation, fill = Sample)) +
  geom_density(alpha = 0.4) +
  labs(title = "Monte Carlo Simulation: Correlation Between Traffic Volume & Temperature",
       x = "Correlation", y = "Density") +
  theme(axis.text=element_text(size=20),
          axis.title.x = element_text(size = 20),
          axis.title.y = element_text(size = 20),
          plot.title = element_text(size = 20),
          legend.key.size = unit(2,"cm"),
          legend.text = element_text(size = 18),
          legend.title = element_text(size = 14),
          panel.background = element_rect(fill = 'white'),
          panel.grid.major = element_line(color = "grey"))

Since the correlation values are consistent across resamples, the relationship between temperature and traffic volume is stable. We confirmed this in the 3.2 Correlation analysis as well.