library(ggplot2)
library(moments)
library(boot)
file_path = "C:/Users/Nandan Hegde/OneDrive/Documents/MSU_Grad_Studies/STT810_Homeworks/synthetic_covid_impact_on_work.csv"
main_df = read.csv(file_path)
head(main_df)
## Increased_Work_Hours Work_From_Home Hours_Worked_Per_Day Meetings_Per_Day
## 1 1 1 6.392394 2.6845944
## 2 1 1 9.171984 3.3392246
## 3 1 0 10.612561 2.2183327
## 4 1 1 5.546169 5.1505662
## 5 0 1 11.424615 3.1211255
## 6 1 1 7.742898 -0.5829769
## Productivity_Change Stress_Level Health_Issue Job_Security
## 1 1 Low 0 0
## 2 1 Low 0 1
## 3 0 Medium 0 0
## 4 0 Medium 0 0
## 5 1 Medium 0 1
## 6 1 Low 1 0
## Childcare_Responsibilities Commuting_Changes Technology_Adaptation
## 1 1 1 1
## 2 0 1 1
## 3 0 0 0
## 4 0 1 0
## 5 1 1 0
## 6 1 1 1
## Salary_Changes Team_Collaboration_Challenges Sector Affected_by_Covid
## 1 0 1 Retail 1
## 2 0 1 IT 1
## 3 0 0 Retail 1
## 4 0 0 Education 1
## 5 1 1 Education 1
## 6 0 1 IT 1
# Calculate the mean of Hours_Worked_Per_Day
mean_hrs = mean(main_df$Hours_Worked_Per_Day, na.rm = TRUE)
# Calculate the standard deviation of Hours_Worked_Per_Day
std_hrs = sd(main_df$Hours_Worked_Per_Day, na.rm = TRUE)
# Calculate the 5th and 95th quantiles_hrs of Hours_Worked_Per_Day
quantiles_hrs = quantile(main_df$Hours_Worked_Per_Day, probs = c(0.05, 0.95), na.rm = TRUE)
# Print the results
cat("Mean of Hours_Worked_Per_Day:", mean_hrs, "\n")
## Mean of Hours_Worked_Per_Day: 8.006538
cat("Standard Deviation of Hours_Worked_Per_Day:", std_hrs, "\n")
## Standard Deviation of Hours_Worked_Per_Day: 1.978468
cat("5th Quantile of Hours_Worked_Per_Day:", quantiles_hrs[1], "\n")
## 5th Quantile of Hours_Worked_Per_Day: 4.809743
cat("95th Quantile of Hours_Worked_Per_Day:", quantiles_hrs[2], "\n")
## 95th Quantile of Hours_Worked_Per_Day: 11.29278
# Visual checks for normality
# Q-Q plot
qqnorm(main_df$Hours_Worked_Per_Day)
qqline(main_df$Hours_Worked_Per_Day)

# Histogram with density curve
ggplot(main_df, aes(x=Hours_Worked_Per_Day)) +
geom_histogram(aes(y=..density..), binwidth=0.5, fill="lightblue", color="black") +
geom_density(color="red") +
labs(title="Distribution of Hours Worked Per Day",
x="Hours Worked",
y="Density")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Calculate skewness and kurtosis for numerical assessment
skew = skewness(main_df$Hours_Worked_Per_Day)
kurt = kurtosis(main_df$Hours_Worked_Per_Day)
# a. Constructing confidence interval
n = length(main_df$Hours_Worked_Per_Day)
se = std_hrs/sqrt(n)
# For large samples, we can use z-score instead of t-score
z_crit = qnorm(0.975) # for 95% confidence
margin_error = z_crit * se
ci_lower = mean_hrs - margin_error
ci_upper = mean_hrs + margin_error
# b.i Analytical p-value calculation
# Using z-test since n is large
z_stat = (mean_hrs - 8)/(std_hrs/sqrt(n))
p_value_analytical = 1 - pnorm(z_stat)
# b.ii Parametric simulation
set.seed(123)
n_simulation = 10000
sim_means = replicate(n_simulation, {
sim_list_data = rnorm(n, mean=8, sd=std_hrs)
mean(sim_list_data)
})
p_value_simulation = mean(sim_means >= mean_hrs)
# Print all results
cat("\nSample Statistics:\n")
##
## Sample Statistics:
cat("Sample size:", n, "\n")
## Sample size: 10000
cat("Sample mean:", mean_hrs, "\n")
## Sample mean: 8.006538
cat("Standard deviation:", std_hrs, "\n")
## Standard deviation: 1.978468
cat("Standard error:", se, "\n")
## Standard error: 0.01978468
cat("Skewness:", skew, "\n")
## Skewness: 0.01377519
cat("Kurtosis:", kurt, "\n")
## Kurtosis: 2.968298
cat("\nHypothesis Test Results:\n")
##
## Hypothesis Test Results:
cat("95% Confidence Interval:", ci_lower, "to", ci_upper, "\n")
## 95% Confidence Interval: 7.96776 to 8.045315
cat("Analytical p-value:", p_value_analytical, "\n")
## Analytical p-value: 0.3705347
cat("Simulation p-value:", p_value_simulation, "\n")
## Simulation p-value: 0.378
# Function to calculate statistics for bootstrapping
calc_stats = function(list_data, indices) {
d = list_data[indices]
c(
mean = mean(d),
sd = sd(d),
quantile5 = quantile(d, 0.05),
quantile95 = quantile(d, 0.95)
)
}
# Extract Hours_Worked_Per_Day from main_df
hours_worked = main_df$Hours_Worked_Per_Day
# Calculate original statistics
cat("Original Statistics:\n")
## Original Statistics:
original_mean = mean(hours_worked)
original_std = sd(hours_worked)
original_q5 = quantile(hours_worked, 0.05)
original_q95 = quantile(hours_worked, 0.95)
cat("Mean:", round(original_mean, 6), "\n")
## Mean: 8.006538
cat("Standard Deviation:", round(original_std, 6), "\n")
## Standard Deviation: 1.978468
cat("5th percentile:", round(original_q5, 6), "\n")
## 5th percentile: 4.809743
cat("95th percentile:", round(original_q95, 6), "\n\n")
## 95th percentile: 11.29278
# Perform bootstrapping
set.seed(123) # for reproducibility
boot_results = boot(hours_worked, calc_stats, R=10000)
# Calculate bootstrap confidence intervals
CI_mean = boot.ci(boot_results, type="perc", index=1)
CI_std = boot.ci(boot_results, type="perc", index=2)
CI_quantile5 = boot.ci(boot_results, type="perc", index=3)
CI_quantile_95 = boot.ci(boot_results, type="perc", index=4)
cat("Bootstrap 95% Confidence Intervals:\n")
## Bootstrap 95% Confidence Intervals:
cat("Mean CI:", round(CI_mean$percent[4], 6), "to", round(CI_mean$percent[5], 6), "\n")
## Mean CI: 7.966871 to 8.044802
cat("SD CI:", round(CI_std$percent[4], 6), "to", round(CI_std$percent[5], 6), "\n")
## SD CI: 1.952257 to 2.006132
cat("5th percentile CI:", round(CI_quantile5$percent[4], 6), "to", round(CI_quantile5$percent[5], 6), "\n")
## 5th percentile CI: 4.738974 to 4.878245
cat("95th percentile CI:", round(CI_quantile_95$percent[4], 6), "to", round(CI_quantile_95$percent[5], 6), "\n\n")
## 95th percentile CI: 11.20232 to 11.37013
# Calculate chi-square confidence interval for standard deviation
n = length(hours_worked)
s = sd(hours_worked)
alpha = 0.05
chi_lower = sqrt((n-1) * s^2 / qchisq(1-alpha/2, n-1))
chi_upper = sqrt((n-1) * s^2 / qchisq(alpha/2, n-1))
cat("Chi-square based CI for Standard Deviation:\n")
## Chi-square based CI for Standard Deviation:
cat("(", round(chi_lower, 6), ",", round(chi_upper, 6), ")\n\n")
## ( 1.951425 , 2.006277 )
# Create visualization of bootstrap distributions
par(mfrow=c(2,2))
# Histogram of bootstrapped means
hist(boot_results$t[,1], main="Bootstrap Distribution - Mean",
xlab="Mean", breaks=50)
abline(v=CI_mean$percent[4:5], col="red", lty=2)
abline(v=original_mean, col="blue", lwd=2)
# Histogram of bootstrapped standard deviations
hist(boot_results$t[,2], main="Bootstrap Distribution - SD",
xlab="Standard Deviation", breaks=50)
abline(v=CI_std$percent[4:5], col="red", lty=2)
abline(v=original_std, col="blue", lwd=2)
abline(v=c(chi_lower, chi_upper), col="green", lty=2)
# Histogram of bootstrapped 5th percentiles
hist(boot_results$t[,3], main="Bootstrap Distribution - 5th Percentile",
xlab="5th Percentile", breaks=50)
abline(v=CI_quantile5$percent[4:5], col="red", lty=2)
abline(v=original_q5, col="blue", lwd=2)
# Histogram of bootstrapped 95th percentiles
hist(boot_results$t[,4], main="Bootstrap Distribution - 95th Percentile",
xlab="95th Percentile", breaks=50)
abline(v=CI_quantile_95$percent[4:5], col="red", lty=2)
abline(v=original_q95, col="blue", lwd=2)
# Add legend to the first plot
legend("topright",
legend=c("CI Bounds", "Original Value", "Chi-sq CI (SD only)"),
col=c("red", "blue", "green"),
lty=c(2, 1, 2),
cex=0.6)

# Set random seed for reproducibility
set.seed(123)
# Given list_data
list_data = c(7, 5, 9, 9, 3, 5, 6, 10, 6, 8, 5, 13, 6, 9, 9)
n = length(list_data)
interval = 3 # seconds
observed_rate = mean(list_data)/interval # Convert to rate per second
# Calculate test statistic from observed list_data
observed_mean = mean(list_data)
# Monte Carlo simulation
n_simulations = 10000
simulated_means = numeric(n_simulations)
# Null hypothesis: lambda = 3.2 (per second)
null_lambda = 3.2 * interval # Convert to expected counts per 3-second interval
# Perform simulation
for(i in 1:n_simulations) {
# Generate sample under null hypothesis
simulated_list_data = rpois(n, lambda = null_lambda)
simulated_means[i] = mean(simulated_list_data)
}
# Calculate correct p-value for H1: λ > 3.2
# This is the proportion of simulated means that are LESS than or equal to observed
# because we want P(X ≥ observed | H0) for a right-tailed test
p_value_correct = mean(simulated_means >= observed_mean)
# Print results
cat("Results:\n")
## Results:
cat("Observed mean count per 3 seconds:", round(observed_mean, 4), "\n")
## Observed mean count per 3 seconds: 7.3333
cat("Observed rate per second:", round(observed_rate, 4), "\n")
## Observed rate per second: 2.4444
cat("Null hypothesis rate per second:", 3.2, "\n\n")
## Null hypothesis rate per second: 3.2
cat("Hypothesis Test Results:\n")
## Hypothesis Test Results:
cat("H0: λ ≤ 3.2 vs H1: λ > 3.2\n")
## H0: λ ≤ 3.2 vs H1: λ > 3.2
cat("P-value:", round(p_value_correct, 4), "\n")
## P-value: 0.9979
cat("95% confidence level decision:",
ifelse(p_value_correct < 0.05, "Reject null hypothesis", "Fail to reject null hypothesis"), "\n")
## 95% confidence level decision: Fail to reject null hypothesis
# Create visualization
par(mfrow=c(1,1))
hist(simulated_means, breaks=50,
main="Distribution of Simulated Means Under Null Hypothesis",
xlab="Mean Count (per 3 seconds)",
col="lightblue", border="white")
abline(v=observed_mean, col="red", lwd=2)
abline(v=null_lambda, col="blue", lwd=2, lty=2)
legend("topright",
legend=c("Observed Mean", "Null Hypothesis Mean"),
col=c("red", "blue"),
lwd=2,
lty=c(1, 2))
# Add arrows and annotations to show direction of hypothesis test
arrows(null_lambda, 40, null_lambda + 2, 40,
col="darkgreen", length=0.1, lwd=2)
text(null_lambda, 45, "Region to reject H0\nin favor of λ > 3.2",
col="darkgreen", pos=4)

# Specify the full path to the CSV file
file_path = "C:/Users/Nandan Hegde/OneDrive/Documents/MSU_Grad_Studies/STT810_Homeworks/react.csv"
# Read the CSV file
react_list_data = read.csv(file_path)
# Check the first few rows to ensure the main_df was read correctly
head(react_list_data)
## Difference
## 1 -9
## 2 -6
## 3 -5
## 4 -5
## 5 -5
## 6 -4
# Calculate bootstrap 98% confidence interval for the mean difference in reaction size estimates
set.seed(123) # For reproducibility
# Number of bootstrap samples
n_boot = 10000
# Perform bootstrap sampling and calculate the mean difference for each sample
bootstrap_means = replicate(n_boot, mean(sample(react_list_data$Difference, replace = TRUE)))
# Calculate the 98% confidence interval (1% in each tail)
CI_98_react = quantile(bootstrap_means, probs = c(0.01, 0.99))
cat("98% Bootstrap Confidence Interval for Mean Difference in Reaction Size:", CI_98_react[1], "to", CI_98_react[2], "\n")
## 98% Bootstrap Confidence Interval for Mean Difference in Reaction Size: -1.035928 to -0.5508982
# Perform bootstrap sampling and calculate the mean for each sample
bootstrap_hours_means = replicate(n_boot, mean(sample(main_df$Hours_Worked_Per_Day, replace = TRUE)))
# Calculate the 98% confidence interval (1% in each tail)
CI_98_hrs = quantile(bootstrap_hours_means, probs = c(0.01, 0.99))
cat("98% Bootstrap Confidence Interval for Mean of Hours_Worked_Per_Day:", CI_98_hrs[1], "to", CI_98_hrs[2], "\n")
## 98% Bootstrap Confidence Interval for Mean of Hours_Worked_Per_Day: 7.960511 to 8.050721