Homework4.knit

library(ggplot2)
library(moments)
library(boot)

file_path = "C:/Users/Nandan Hegde/OneDrive/Documents/MSU_Grad_Studies/STT810_Homeworks/synthetic_covid_impact_on_work.csv"
main_df = read.csv(file_path)
head(main_df)

##   Increased_Work_Hours Work_From_Home Hours_Worked_Per_Day Meetings_Per_Day
## 1                    1              1             6.392394        2.6845944
## 2                    1              1             9.171984        3.3392246
## 3                    1              0            10.612561        2.2183327
## 4                    1              1             5.546169        5.1505662
## 5                    0              1            11.424615        3.1211255
## 6                    1              1             7.742898       -0.5829769
##   Productivity_Change Stress_Level Health_Issue Job_Security
## 1                   1          Low            0            0
## 2                   1          Low            0            1
## 3                   0       Medium            0            0
## 4                   0       Medium            0            0
## 5                   1       Medium            0            1
## 6                   1          Low            1            0
##   Childcare_Responsibilities Commuting_Changes Technology_Adaptation
## 1                          1                 1                     1
## 2                          0                 1                     1
## 3                          0                 0                     0
## 4                          0                 1                     0
## 5                          1                 1                     0
## 6                          1                 1                     1
##   Salary_Changes Team_Collaboration_Challenges    Sector Affected_by_Covid
## 1              0                             1    Retail                 1
## 2              0                             1        IT                 1
## 3              0                             0    Retail                 1
## 4              0                             0 Education                 1
## 5              1                             1 Education                 1
## 6              0                             1        IT                 1

# Calculate the mean of Hours_Worked_Per_Day
mean_hrs = mean(main_df$Hours_Worked_Per_Day, na.rm = TRUE)

# Calculate the standard deviation of Hours_Worked_Per_Day
std_hrs = sd(main_df$Hours_Worked_Per_Day, na.rm = TRUE)

# Calculate the 5th and 95th quantiles_hrs of Hours_Worked_Per_Day
quantiles_hrs = quantile(main_df$Hours_Worked_Per_Day, probs = c(0.05, 0.95), na.rm = TRUE)

# Print the results
cat("Mean of Hours_Worked_Per_Day:", mean_hrs, "\n")

## Mean of Hours_Worked_Per_Day: 8.006538

cat("Standard Deviation of Hours_Worked_Per_Day:", std_hrs, "\n")

## Standard Deviation of Hours_Worked_Per_Day: 1.978468

cat("5th Quantile of Hours_Worked_Per_Day:", quantiles_hrs[1], "\n")

## 5th Quantile of Hours_Worked_Per_Day: 4.809743

cat("95th Quantile of Hours_Worked_Per_Day:", quantiles_hrs[2], "\n")

## 95th Quantile of Hours_Worked_Per_Day: 11.29278

# Visual checks for normality


# Q-Q plot
qqnorm(main_df$Hours_Worked_Per_Day)
qqline(main_df$Hours_Worked_Per_Day)

# Histogram with density curve
ggplot(main_df, aes(x=Hours_Worked_Per_Day)) +
  geom_histogram(aes(y=..density..), binwidth=0.5, fill="lightblue", color="black") +
  geom_density(color="red") +
  labs(title="Distribution of Hours Worked Per Day",
       x="Hours Worked",
       y="Density")

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Calculate skewness and kurtosis for numerical assessment

skew = skewness(main_df$Hours_Worked_Per_Day)
kurt = kurtosis(main_df$Hours_Worked_Per_Day)

# a. Constructing confidence interval
n = length(main_df$Hours_Worked_Per_Day)
se = std_hrs/sqrt(n)

# For large samples, we can use z-score instead of t-score
z_crit = qnorm(0.975)  # for 95% confidence
margin_error = z_crit * se

ci_lower = mean_hrs - margin_error
ci_upper = mean_hrs + margin_error

# b.i Analytical p-value calculation
# Using z-test since n is large
z_stat = (mean_hrs - 8)/(std_hrs/sqrt(n))
p_value_analytical = 1 - pnorm(z_stat)

# b.ii Parametric simulation
set.seed(123)
n_simulation = 10000
sim_means = replicate(n_simulation, {
  sim_list_data = rnorm(n, mean=8, sd=std_hrs)
  mean(sim_list_data)
})
p_value_simulation = mean(sim_means >= mean_hrs)

# Print all results
cat("\nSample Statistics:\n")

## 
## Sample Statistics:

cat("Sample size:", n, "\n")

## Sample size: 10000

cat("Sample mean:", mean_hrs, "\n")

## Sample mean: 8.006538

cat("Standard deviation:", std_hrs, "\n")

## Standard deviation: 1.978468

cat("Standard error:", se, "\n")

## Standard error: 0.01978468

cat("Skewness:", skew, "\n")

## Skewness: 0.01377519

cat("Kurtosis:", kurt, "\n")

## Kurtosis: 2.968298

cat("\nHypothesis Test Results:\n")

## 
## Hypothesis Test Results:

cat("95% Confidence Interval:", ci_lower, "to", ci_upper, "\n")

## 95% Confidence Interval: 7.96776 to 8.045315

cat("Analytical p-value:", p_value_analytical, "\n")

## Analytical p-value: 0.3705347

cat("Simulation p-value:", p_value_simulation, "\n")

## Simulation p-value: 0.378

# Function to calculate statistics for bootstrapping
calc_stats = function(list_data, indices) {
  d = list_data[indices]
  c(
    mean = mean(d),
    sd = sd(d),
    quantile5 = quantile(d, 0.05),
    quantile95 = quantile(d, 0.95)
  )
}

# Extract Hours_Worked_Per_Day from main_df
hours_worked = main_df$Hours_Worked_Per_Day

# Calculate original statistics
cat("Original Statistics:\n")

## Original Statistics:

original_mean = mean(hours_worked)
original_std = sd(hours_worked)
original_q5 = quantile(hours_worked, 0.05)
original_q95 = quantile(hours_worked, 0.95)

cat("Mean:", round(original_mean, 6), "\n")

## Mean: 8.006538

cat("Standard Deviation:", round(original_std, 6), "\n")

## Standard Deviation: 1.978468

cat("5th percentile:", round(original_q5, 6), "\n")

## 5th percentile: 4.809743

cat("95th percentile:", round(original_q95, 6), "\n\n")

## 95th percentile: 11.29278

# Perform bootstrapping
set.seed(123) # for reproducibility
boot_results = boot(hours_worked, calc_stats, R=10000)

# Calculate bootstrap confidence intervals
CI_mean = boot.ci(boot_results, type="perc", index=1)
CI_std = boot.ci(boot_results, type="perc", index=2)
CI_quantile5 = boot.ci(boot_results, type="perc", index=3)
CI_quantile_95 = boot.ci(boot_results, type="perc", index=4)

cat("Bootstrap 95% Confidence Intervals:\n")

## Bootstrap 95% Confidence Intervals:

cat("Mean CI:", round(CI_mean$percent[4], 6), "to", round(CI_mean$percent[5], 6), "\n")

## Mean CI: 7.966871 to 8.044802

cat("SD CI:", round(CI_std$percent[4], 6), "to", round(CI_std$percent[5], 6), "\n")

## SD CI: 1.952257 to 2.006132

cat("5th percentile CI:", round(CI_quantile5$percent[4], 6), "to", round(CI_quantile5$percent[5], 6), "\n")

## 5th percentile CI: 4.738974 to 4.878245

cat("95th percentile CI:", round(CI_quantile_95$percent[4], 6), "to", round(CI_quantile_95$percent[5], 6), "\n\n")

## 95th percentile CI: 11.20232 to 11.37013

# Calculate chi-square confidence interval for standard deviation
n = length(hours_worked)
s = sd(hours_worked)
alpha = 0.05

chi_lower = sqrt((n-1) * s^2 / qchisq(1-alpha/2, n-1))
chi_upper = sqrt((n-1) * s^2 / qchisq(alpha/2, n-1))

cat("Chi-square based CI for Standard Deviation:\n")

## Chi-square based CI for Standard Deviation:

cat("(", round(chi_lower, 6), ",", round(chi_upper, 6), ")\n\n")

## ( 1.951425 , 2.006277 )

# Create visualization of bootstrap distributions
par(mfrow=c(2,2))

# Histogram of bootstrapped means
hist(boot_results$t[,1], main="Bootstrap Distribution - Mean",
     xlab="Mean", breaks=50)
abline(v=CI_mean$percent[4:5], col="red", lty=2)
abline(v=original_mean, col="blue", lwd=2)

# Histogram of bootstrapped standard deviations
hist(boot_results$t[,2], main="Bootstrap Distribution - SD",
     xlab="Standard Deviation", breaks=50)
abline(v=CI_std$percent[4:5], col="red", lty=2)
abline(v=original_std, col="blue", lwd=2)
abline(v=c(chi_lower, chi_upper), col="green", lty=2)

# Histogram of bootstrapped 5th percentiles
hist(boot_results$t[,3], main="Bootstrap Distribution - 5th Percentile",
     xlab="5th Percentile", breaks=50)
abline(v=CI_quantile5$percent[4:5], col="red", lty=2)
abline(v=original_q5, col="blue", lwd=2)

# Histogram of bootstrapped 95th percentiles
hist(boot_results$t[,4], main="Bootstrap Distribution - 95th Percentile",
     xlab="95th Percentile", breaks=50)
abline(v=CI_quantile_95$percent[4:5], col="red", lty=2)
abline(v=original_q95, col="blue", lwd=2)

# Add legend to the first plot
legend("topright", 
       legend=c("CI Bounds", "Original Value", "Chi-sq CI (SD only)"),
       col=c("red", "blue", "green"), 
       lty=c(2, 1, 2),
       cex=0.6)

# Set random seed for reproducibility
set.seed(123)

# Given list_data
list_data = c(7, 5, 9, 9, 3, 5, 6, 10, 6, 8, 5, 13, 6, 9, 9)
n = length(list_data)
interval = 3  # seconds
observed_rate = mean(list_data)/interval  # Convert to rate per second

# Calculate test statistic from observed list_data
observed_mean = mean(list_data)

# Monte Carlo simulation
n_simulations = 10000
simulated_means = numeric(n_simulations)

# Null hypothesis: lambda = 3.2 (per second)
null_lambda = 3.2 * interval  # Convert to expected counts per 3-second interval

# Perform simulation
for(i in 1:n_simulations) {
    # Generate sample under null hypothesis
    simulated_list_data = rpois(n, lambda = null_lambda)
    simulated_means[i] = mean(simulated_list_data)
}

# Calculate correct p-value for H1: λ > 3.2
# This is the proportion of simulated means that are LESS than or equal to observed
# because we want P(X ≥ observed | H0) for a right-tailed test
p_value_correct = mean(simulated_means >= observed_mean)

# Print results
cat("Results:\n")

## Results:

cat("Observed mean count per 3 seconds:", round(observed_mean, 4), "\n")

## Observed mean count per 3 seconds: 7.3333

cat("Observed rate per second:", round(observed_rate, 4), "\n")

## Observed rate per second: 2.4444

cat("Null hypothesis rate per second:", 3.2, "\n\n")

## Null hypothesis rate per second: 3.2

cat("Hypothesis Test Results:\n")

## Hypothesis Test Results:

cat("H0: λ ≤ 3.2 vs H1: λ > 3.2\n")

## H0: λ ≤ 3.2 vs H1: λ > 3.2

cat("P-value:", round(p_value_correct, 4), "\n")

## P-value: 0.9979

cat("95% confidence level decision:", 
    ifelse(p_value_correct < 0.05, "Reject null hypothesis", "Fail to reject null hypothesis"), "\n")

## 95% confidence level decision: Fail to reject null hypothesis

# Create visualization
par(mfrow=c(1,1))
hist(simulated_means, breaks=50, 
     main="Distribution of Simulated Means Under Null Hypothesis",
     xlab="Mean Count (per 3 seconds)",
     col="lightblue", border="white")
abline(v=observed_mean, col="red", lwd=2)
abline(v=null_lambda, col="blue", lwd=2, lty=2)
legend("topright", 
       legend=c("Observed Mean", "Null Hypothesis Mean"),
       col=c("red", "blue"), 
       lwd=2,
       lty=c(1, 2))

# Add arrows and annotations to show direction of hypothesis test
arrows(null_lambda, 40, null_lambda + 2, 40, 
       col="darkgreen", length=0.1, lwd=2)
text(null_lambda, 45, "Region to reject H0\nin favor of λ > 3.2", 
     col="darkgreen", pos=4)

# Specify the full path to the CSV file
file_path = "C:/Users/Nandan Hegde/OneDrive/Documents/MSU_Grad_Studies/STT810_Homeworks/react.csv"

# Read the CSV file
react_list_data = read.csv(file_path)

# Check the first few rows to ensure the main_df was read correctly
head(react_list_data)

##   Difference
## 1         -9
## 2         -6
## 3         -5
## 4         -5
## 5         -5
## 6         -4

# Calculate bootstrap 98% confidence interval for the mean difference in reaction size estimates
set.seed(123)  # For reproducibility

# Number of bootstrap samples
n_boot = 10000

# Perform bootstrap sampling and calculate the mean difference for each sample
bootstrap_means = replicate(n_boot, mean(sample(react_list_data$Difference, replace = TRUE)))

# Calculate the 98% confidence interval (1% in each tail)
CI_98_react = quantile(bootstrap_means, probs = c(0.01, 0.99))
cat("98% Bootstrap Confidence Interval for Mean Difference in Reaction Size:", CI_98_react[1], "to", CI_98_react[2], "\n")

## 98% Bootstrap Confidence Interval for Mean Difference in Reaction Size: -1.035928 to -0.5508982

# Perform bootstrap sampling and calculate the mean for each sample
bootstrap_hours_means = replicate(n_boot, mean(sample(main_df$Hours_Worked_Per_Day, replace = TRUE)))

# Calculate the 98% confidence interval (1% in each tail)
CI_98_hrs = quantile(bootstrap_hours_means, probs = c(0.01, 0.99))
cat("98% Bootstrap Confidence Interval for Mean of Hours_Worked_Per_Day:", CI_98_hrs[1], "to", CI_98_hrs[2], "\n")

## 98% Bootstrap Confidence Interval for Mean of Hours_Worked_Per_Day: 7.960511 to 8.050721