SDS-HW1

Question 2

Part a

set.seed(338877)

# Generate n = 100 data points
n <- 100
X <- runif(n, 0, 1)  # X ~ Uniform(0,1)
beta0 <- 5
beta1 <- 3
epsilon <- rnorm(n, mean = 0, sd = 1)  # ε ~ N(0,1)
Y <- beta0 + beta1 * X + epsilon  # Generate Y values

# Plot the data
plot(X, Y, main = "Scatter Plot of X and Y with Regression Line", xlab = "X", ylab = "Y", pch = 16, col = "slateblue")

# Fit the regression model
model <- lm(Y ~ X)

# Add the fitted line to the plot
abline(model, col = "red", lwd = 2)

Part b

set.seed(338877)
num_simulations <- 1000
estimates_beta1 <- numeric(num_simulations) # Initialize variable to store estimates of beta1

# Repeat experiment in (a) 1000 times
for (i in 1:num_simulations) {
  X <- runif(100, 0, 1)  # X ~ Uniform(0,1)
  epsilon <- rnorm(100, mean = 0, sd = 1)  # ε ~ N(0,1)
  Y <- 5 + 3 * X + epsilon  # Generate Y values
  
  # Fit regression model
  model <- lm(Y ~ X)
  
  #Store beta1 estimate
  estimates_beta1[i] <- coef(model)[2]
}


# Calculate mean and display expected mean
mean_beta1 <- mean(estimates_beta1)
cat("Mean of β1 estimates:", mean_beta1, "\n")

## Mean of β1 estimates: 2.996678

cat("Expected mean of β1 estimates: 3\n")

## Expected mean of β1 estimates: 3

# Plot histogram
hist(estimates_beta1, breaks = 30, col = "slateblue", border = "black",
     main = "Histogram of β1 Estimates", xlab = "β1 Estimates", probability = TRUE)

Part c

set.seed(338877)
num_simulations <- 1000
estimates_beta1_cauchy <- numeric(num_simulations) # Initialize variable to store estimates of beta1

# Repeat experiment in (a) 1000 times with cauchy errors
for (i in 1:num_simulations) {
  X <- runif(100, 0, 1)  # X ~ Uniform(0,1)
  epsilon <- rcauchy(100, location = 0, scale = 1)  # ε ~ Cauchy(0,1)
  Y <- 5 + 3 * X + epsilon  # Generate Y values
  
  # Fit regression model
  model <- lm(Y ~ X)
  
  # Store beta1 estimate
  estimates_beta1_cauchy[i] <- coef(model)[2]
}

# Calculate mean of beta1 estimates
mean_beta1_cauchy <- mean(estimates_beta1_cauchy, na.rm = TRUE)
cat("Mean of β1 estimates (Cauchy errors):", mean_beta1_cauchy, "\n")

## Mean of β1 estimates (Cauchy errors): -0.6045391

# Plot histogram of β1 estimates with Cauchy errors
hist(estimates_beta1_cauchy, breaks = 30, col = "lightblue", border = "black",
     main = "Histogram of β1 Estimates (with Cauchy Errors)", xlab = "β1 Estimates", probability = TRUE)

The histogram changes from a normal-like bell shaped curve to showing two large bars at extreme positive and negative values. The graph has high variability.

Part d

set.seed(338877)
n <- 100
num_simulations <- 1000
beta0 <- 5
beta1 <- 3
estimates_beta1 <- numeric(num_simulations)

# Repeat experiment 1000 times
for (i in 1:num_simulations) {
  X <- runif(n, 0, 1)  # X ~ Uniform(0,1)
  delta <- rnorm(n, mean = 0, sd = sqrt(2))  # δ ~ N(0,2)
  epsilon <- rnorm(n, mean = 0, sd = 1)  # ε ~ N(0,1)
  
  W <- X + delta  # W = X + δ (noisy approximations of X)
  Y <- beta0 + beta1 * X + epsilon  # Y = β0 + β1 * X + ε
  
  # Fit the regression model using W as the predictor
  model <- lm(Y ~ W)
  
  # Store beta1 estimate
  estimates_beta1[i] <- coef(model)[2]
}

# Calculate mean of beta1 estimates
mean_beta1 <- mean(estimates_beta1)
cat("Mean of β1 estimates (errors-in-variables):", mean_beta1, "\n")

## Mean of β1 estimates (errors-in-variables): 0.1206781

# Plot histogram of β1 estimates
hist(estimates_beta1, breaks = 30, col = "lightblue1", border = "black",
     main = "Histogram of β1 Estimates (Errors-in-Variables)", xlab = "β1 Estimates", probability = TRUE)

The estimates of Beta1 are centered around 0.1206781, which is significantly lower than 3. Therefore, the effect of having errors in the Xi values is a downward bias in the estimated slope so the true relationship is consistently being underestimated due to the errors present.

Question 3

Part d

set.seed(338877)

# 500 observations
n <- 500
X <- runif(n, 0, 1)  # X ~ Unif(0,1)
epsilon <- rnorm(n, mean = 0, sd = 1)  # ε ~ N(0,1)
Y <- exp(X) + epsilon  # Y = exp(X) + ε

# Plot the graph
plot(X, Y, main = "Scatter plot of Y vs X (500 observations)", xlab = "X", ylab = "Y", pch = 16, col = 'darkolivegreen3')

# Linear regression
model <- lm(Y ~ X)
#summary(model)

# Estimated coefficients
beta0_hat <- coef(model)[1]
beta1_hat <- coef(model)[2]
cat("Estimated Intercept (β0) w/ 500 observations:", beta0_hat, "\n")

## Estimated Intercept (β0) w/ 500 observations: 0.9512109

cat("Estimated Slope (β1) w/ 500 observations:", beta1_hat, "\n")

## Estimated Slope (β1) w/ 500 observations: 1.475152

# Repeat with 1 million observations
n <- 1e6
X <- runif(n, 0, 1) 
epsilon <- rnorm(n, mean = 0, sd = 1)
Y <- exp(X) + epsilon

# Plot the graph
plot(X, Y, main = "Scatter plot of Y vs X (1000 observations)", xlab = "X", ylab = "Y", pch = 16, col = 'darkolivegreen3')

# Linear regression
model <- lm(Y ~ X)

# Estimated coefficients
beta0_hat <- coef(model)[1]
beta1_hat <- coef(model)[2]
cat("Estimated Intercept (β0) w/ 1 million observations:", beta0_hat, "\n")

## Estimated Intercept (β0) w/ 1 million observations: 0.8707354

cat("Estimated Slope (β1) w/ 1 million observations:", beta1_hat, "\n")

## Estimated Slope (β1) w/ 1 million observations: 1.694841

The true values of β0 and β1 are 1.6903 and 0.8731 respectively.

The estimated coefficients with the 500 observations are slightly off from the true values. 0.95 is a bit higher than the true value and 1.475 is a bit lower. But when the number of observations increased to 1 millions, the estimates were very close to the true value. I believe this is due to the law of large numbers.

Part e

set.seed(338877)
n <- 500
X <- runif(n, 0, 1)  # X ~ Unif(0,1)
epsilon <- rnorm(n, mean = 0, sd = 1)  # ε ~ N(0,1)
Y <- exp(X) + epsilon  # Y = exp(X) + ε

# Plot the graph
plot(X, Y, main = "Scatter plot of Y vs X (500 observations)", xlab = "X", ylab = "Y", pch = 16, col = 'darkolivegreen3')

# (i) Estimated best regression line
model <- lm(Y ~ X)
beta0_hat <- coef(model)[1]
beta1_hat <- coef(model)[2]

# (ii) True best regression line (E[Y] = β0 + β1 * X)
beta0_true <- 4 * exp(1) - 10
beta1_true <- 18 - 6 * exp(1)

# (iii) True conditional mean/regression function (E[Y | X] = exp(X))
x_seq <- seq(0, 1, length.out = 100)  # Generate values for smooth curve
y_true <- exp(x_seq)  # True function E[Y | X] = exp(X)

# Add lines to graph
abline(a = beta0_hat, b = beta1_hat, col = "red", lwd = 2)
abline(a = beta0_true, b = beta1_true, col = "black", lwd = 2)
lines(x_seq, y_true, col = "blue", lwd = 2)

# Add Legend
legend("topleft", legend = c("Estimated Regression Line", "True Best Regression Line", "True Regression Function"), col = c("red", "black", "blue"), lwd = 2, lty = c(2, 2, 1), bty = "n")

SDS-HW1

Samhitha Kunadharaju

2025-02-06

Question 2

Part a

Part b

Part c

Part d

Question 3

Part d

Part e