** Please submit both .Rmd file and the compiled .html file.

Check your work directory.

getwd()
## [1] "C:/Users/zahir/OneDrive/Desktop/Rutgers Semester Folders/Spring 2025/Regression Methods/RMD HW/HW 2"

Problem 1

library(faraway)
## Warning: package 'faraway' was built under R version 4.4.3
data(prostate)
head(prostate)
##       lcavol lweight age      lbph svi      lcp gleason pgg45     lpsa
## 1 -0.5798185  2.7695  50 -1.386294   0 -1.38629       6     0 -0.43078
## 2 -0.9942523  3.3196  58 -1.386294   0 -1.38629       6     0 -0.16252
## 3 -0.5108256  2.6912  74 -1.386294   0 -1.38629       7    20 -0.16252
## 4 -1.2039728  3.2828  58 -1.386294   0 -1.38629       6     0 -0.16252
## 5  0.7514161  3.4324  62 -1.386294   0 -1.38629       6     0  0.37156
## 6 -1.0498221  3.2288  50 -1.386294   0 -1.38629       6     0  0.76547

(a)

plot(prostate$lcavol, prostate$lpsa, xlab = "Log Cancer Vol",
     ylab = "Log PSA",
     main = "Scatter Plot of lcavol vs lpsa")

(b)

mean_lpsa <- mean(prostate$lpsa)
sd_lpsa <- sd(prostate$lpsa)

mean_lcavol <- mean(prostate$lcavol)
sd_lcavol <- sd(prostate$lcavol)

correlation <- cor(prostate$lpsa, prostate$lcavol)

cat("Sample mean of lpsa:", mean_lpsa, "\nSample std dev of lpsa", sd_lpsa, "\n")
## Sample mean of lpsa: 2.478387 
## Sample std dev of lpsa 1.154329
cat("Sample mean of lcavol:", mean_lcavol, "\nSample std dev of lcavol", sd_lcavol, "\n")
## Sample mean of lcavol: 1.35001 
## Sample std dev of lcavol 1.178625

(c)

model <- lm(lpsa ~ lcavol, data = prostate)

b0 <- coef(model)[1]
b1 <- coef(model)[2]


s2_e <- summary(model)$sigma^2

cat("Estimated intercept (b0):", b0, "\n")
## Estimated intercept (b0): 1.507298
cat("Estimated slope (b1):", b1, "\n")
## Estimated slope (b1): 0.7193201
cat("Estimated error variance (s2_e):", s2_e, "\n")
## Estimated error variance (s2_e): 0.6201553
plot(prostate$lcavol, prostate$lpsa,
     xlab = "Log Cancer Vol",
     ylab = "Log PSA",
     main = "Scatter Plot with regression line")

abline(model, col = "red")

(d)

#For every unit increase in lcavol, B1hat tells us that we increase by 0.7193

#Intercept B0 lets us know that when lcavol is at zero, expected lpsa is 1.507298

(e)

model_summary <- summary(model)
r_sqrd <- model_summary$r.squared

cat("R-sqrd val:", r_sqrd)
## R-sqrd val: 0.5394319

(f)

#change lpsa (b1 * (-sdlcavol))
change_lpsa <- b1 * (-sd_lcavol)
change_in_units <- change_lpsa / sd_lpsa

cat("Change in lpsa (in og units):", change_lpsa, "\n")
## Change in lpsa (in og units): -0.8478086
cat("Change in lpsa (in units):", change_in_units)
## Change in lpsa (in units): -0.7344603

(g)

coef_matr <- model_summary$coefficients

alpha = 0.10

df_resid <- model_summary$df[2]

t_crit <- qt(1 - alpha/2, df_resid)

b0_est <- coef_matr[1,1]
b0_se <- coef_matr[1,2]
CI_b0 <- b0_est + c(-1, 1) * t_crit * b0_se

b1_est <- coef_matr[2,1]
b1_se <- coef_matr[2,2]
CI_b1 <- b1_est + c(-1, 1) * t_crit * b1_se

cat("90% CI for b0 (Intrcpt):", CI_b0, "\n")
## 90% CI for b0 (Intrcpt): 1.304755 1.709841
cat("90% CI for b1 (Slope):", CI_b1)
## 90% CI for b1 (Slope): 0.6060482 0.832592

(h)

conf_interv <- confint(model, level = 0.9)

print(conf_interv)
##                   5 %     95 %
## (Intercept) 1.3047545 1.709841
## lcavol      0.6060482 0.832592

(i)

#Null hypoth: is that B1 = 0; and alt hypth: is that B1 != 0

#@ alpha = 0.1, 90% CI for b1 (Slope): 0.6060482 0.832592
#we can reject the null hypothesis since the CI we have does not include 0, meaning that there is significant evidence that lpsa is related to lcavol.

(j)

observation <- data.frame(lcavol = 2)

conf_interv2 <- predict(model, newdata = observation, interval = "confidence", level = 0.95)

pred_interv2 <- predict(model, newdata = observation, interval = "prediction", level = 0.95)

cat("95% CI for mean response µ0 at x0 = 2:\n", conf_interv2, "\n")
## 95% CI for mean response µ0 at x0 = 2:
##  2.945938 2.764442 3.127434
cat("\n95% PI for an individual response y0 at x0 = 2:\n", pred_interv2)
## 
## 95% PI for an individual response y0 at x0 = 2:
##  2.945938 1.372054 4.519822

Problem 2

(2)

set.seed(123)

N <- 1000
n <- 50
b0_T <- 4
b1_T <- 6
s2_T <- 0.3
s_T <- sqrt(s2_T)

b0_hat <- numeric(N)
b1_hat <- numeric(N)
sig_hat <- numeric(N)

for (i in 1:N) {
  x <- rnorm(n, mean = 0, sd = 1)
  epsil <- rnorm(n, mean = 0, sd = s_T)
  y <- b0_T + b1_T * x + epsil
  
  model1 <- lm(y ~ x)
  
  b0_hat[i] <- model1$coef[1]
  b1_hat[i] <- model1$coef[2]
  sig_hat[i] <- summary(model1)$sigma
}
cat("Avg estim of b0:", mean(b0_hat), "\n")
## Avg estim of b0: 4.004058
cat("Avg estim of b1:", mean(b1_hat), "\n")
## Avg estim of b1: 5.99825
cat("Avg estim of sig:", mean(sig_hat), "\n")
## Avg estim of sig: 0.5450496

(a)

hist(b0_hat, main = "Histogram of 1000 Estimates of b0",
     xlab = expression(hat(beta)[0]),)

b0_var <- var(b0_hat)
cat("Sample variance of b0_hat:", b0_var)
## Sample variance of b0_hat: 0.00573333

(b)

hist(b1_hat, main = "Histogram of 1000 Estimates of b1",
     xlab = expression(hat(beta)[1]),)

b1_var <- var(b1_hat)
cat("Sample variance of b1_var:", b1_var)
## Sample variance of b1_var: 0.006287197

(c)

hist(sig_hat, main = "Histogram of 1000 Estimates of sigma^2",
     xlab = expression(hat(sigma)^2),)

(d)

ci_interv_length <- numeric(N)
z_crit <- qnorm(0.95)

for (i in 1:N) {
  x <- rnorm(n, mean = 0, sd = 1)
  eps <- rnorm(n, mean = 0, sd = s_T)
  y <- b0_T + b1_T * x + eps
  
  sum_of_sqrd_dev_x <- sum((x-mean(x))^2)
  
  std_err_b1 <- s_T / sqrt(sum_of_sqrd_dev_x)
  
  ci_interv_length[i] <- 2 * z_crit * std_err_b1
}

hist(ci_interv_length, main = "Histogram of 90% CI Lenths for B1 (sigma known)",
     xlab = "Length of 90% CI for B1")

xbar_length <- mean(ci_interv_length)
cat("Average 90% CI Length for B1 (sigma known):", xbar_length)
## Average 90% CI Length for B1 (sigma known): 0.2606583

(e)

for (i in 1:N) {
  x <- rnorm(n, mean = 0, sd = 1)
  eps <- rnorm(n, mean = 0, sd = sqrt(s2_T))
  y <- b0_T + b1_T * x + eps
  
  model2 <- lm(y ~ x)
  
  sum_of_sqrs_of_x <- sum((x - mean(x))^2)
  
  t_crit <- qt(1 - alpha/2, df = n - 2)
  
  se_b1 <- summary(model2)$sigma / sqrt(sum_of_sqrs_of_x)
  
  ci_interv_length[i] <- 2 * t_crit * se_b1
}
hist(ci_interv_length, main = "Histogram of 90% CI Lengths for B1 (sigma unknown)",
     xlab = "Length of 90% CI for B1")

xbar_length <- mean(ci_interv_length)
cat("Average 90% CI Length for B1 (sigma unknown):", xbar_length)
## Average 90% CI Length for B1 (sigma unknown): 0.2632748

(f)

n = 10
ci_interv_length <- numeric(N)
z_crit <- qnorm(0.95)

for (i in 1:N) {
  x <- rnorm(n, mean = 0, sd = 1)
  eps <- rnorm(n, mean = 0, sd = s_T)
  y <- b0_T + b1_T * x + eps
  
  sum_of_sqrd_dev_x <- sum((x-mean(x))^2)
  
  std_err_b1 <- s_T / sqrt(sum_of_sqrd_dev_x)
  
  ci_interv_length[i] <- 2 * z_crit * std_err_b1
}

hist(ci_interv_length, main = "Histogram of 90% CI Lenths for B1 (sigma known)",
     xlab = "Length of 90% CI for B1")

xbar_length <- mean(ci_interv_length)
cat("Average 90% CI Length for B1 (sigma known):", xbar_length, "\n")
## Average 90% CI Length for B1 (sigma known): 0.6554896
for (i in 1:N) {
  x <- rnorm(n, mean = 0, sd = 1)
  eps <- rnorm(n, mean = 0, sd = sqrt(s2_T))
  y <- b0_T + b1_T * x + eps
  
  model2 <- lm(y ~ x)
  
  sum_of_sqrs_of_x <- sum((x - mean(x))^2)
  
  t_crit <- qt(1 - alpha/2, df = n - 2)
  
  se_b1 <- summary(model2)$sigma / sqrt(sum_of_sqrs_of_x)
  
  ci_interv_length[i] <- 2 * t_crit * se_b1
}
hist(ci_interv_length, main = "Histogram of 90% CI Lengths for B1 (sigma unknown)",
     xlab = "Length of 90% CI for B1")

xbar_length <- mean(ci_interv_length)
cat("Average 90% CI Length for B1 (sigma unknown):", xbar_length)
## Average 90% CI Length for B1 (sigma unknown): 0.7204955
### Average 90% CI Length for B1 (sigma known): 0.6503338 
### Average 90% CI Length for B1 (sigma unknown): 0.7071699

#### The smaller the sample size, the wider the CI gets, and in turn increasing uncertainty for estimating sigma^2, especially for when sigma^2 is unknown vs when it is known