** Please submit both .Rmd file and the compiled .html file.
Check your work directory.
getwd()
## [1] "C:/Users/zahir/OneDrive/Desktop/Rutgers Semester Folders/Spring 2025/Regression Methods/RMD HW/HW 2"
library(faraway)
## Warning: package 'faraway' was built under R version 4.4.3
data(prostate)
head(prostate)
## lcavol lweight age lbph svi lcp gleason pgg45 lpsa
## 1 -0.5798185 2.7695 50 -1.386294 0 -1.38629 6 0 -0.43078
## 2 -0.9942523 3.3196 58 -1.386294 0 -1.38629 6 0 -0.16252
## 3 -0.5108256 2.6912 74 -1.386294 0 -1.38629 7 20 -0.16252
## 4 -1.2039728 3.2828 58 -1.386294 0 -1.38629 6 0 -0.16252
## 5 0.7514161 3.4324 62 -1.386294 0 -1.38629 6 0 0.37156
## 6 -1.0498221 3.2288 50 -1.386294 0 -1.38629 6 0 0.76547
plot(prostate$lcavol, prostate$lpsa, xlab = "Log Cancer Vol",
ylab = "Log PSA",
main = "Scatter Plot of lcavol vs lpsa")
mean_lpsa <- mean(prostate$lpsa)
sd_lpsa <- sd(prostate$lpsa)
mean_lcavol <- mean(prostate$lcavol)
sd_lcavol <- sd(prostate$lcavol)
correlation <- cor(prostate$lpsa, prostate$lcavol)
cat("Sample mean of lpsa:", mean_lpsa, "\nSample std dev of lpsa", sd_lpsa, "\n")
## Sample mean of lpsa: 2.478387
## Sample std dev of lpsa 1.154329
cat("Sample mean of lcavol:", mean_lcavol, "\nSample std dev of lcavol", sd_lcavol, "\n")
## Sample mean of lcavol: 1.35001
## Sample std dev of lcavol 1.178625
model <- lm(lpsa ~ lcavol, data = prostate)
b0 <- coef(model)[1]
b1 <- coef(model)[2]
s2_e <- summary(model)$sigma^2
cat("Estimated intercept (b0):", b0, "\n")
## Estimated intercept (b0): 1.507298
cat("Estimated slope (b1):", b1, "\n")
## Estimated slope (b1): 0.7193201
cat("Estimated error variance (s2_e):", s2_e, "\n")
## Estimated error variance (s2_e): 0.6201553
plot(prostate$lcavol, prostate$lpsa,
xlab = "Log Cancer Vol",
ylab = "Log PSA",
main = "Scatter Plot with regression line")
abline(model, col = "red")
#For every unit increase in lcavol, B1hat tells us that we increase by 0.7193
#Intercept B0 lets us know that when lcavol is at zero, expected lpsa is 1.507298
model_summary <- summary(model)
r_sqrd <- model_summary$r.squared
cat("R-sqrd val:", r_sqrd)
## R-sqrd val: 0.5394319
#change lpsa (b1 * (-sdlcavol))
change_lpsa <- b1 * (-sd_lcavol)
change_in_units <- change_lpsa / sd_lpsa
cat("Change in lpsa (in og units):", change_lpsa, "\n")
## Change in lpsa (in og units): -0.8478086
cat("Change in lpsa (in units):", change_in_units)
## Change in lpsa (in units): -0.7344603
coef_matr <- model_summary$coefficients
alpha = 0.10
df_resid <- model_summary$df[2]
t_crit <- qt(1 - alpha/2, df_resid)
b0_est <- coef_matr[1,1]
b0_se <- coef_matr[1,2]
CI_b0 <- b0_est + c(-1, 1) * t_crit * b0_se
b1_est <- coef_matr[2,1]
b1_se <- coef_matr[2,2]
CI_b1 <- b1_est + c(-1, 1) * t_crit * b1_se
cat("90% CI for b0 (Intrcpt):", CI_b0, "\n")
## 90% CI for b0 (Intrcpt): 1.304755 1.709841
cat("90% CI for b1 (Slope):", CI_b1)
## 90% CI for b1 (Slope): 0.6060482 0.832592
conf_interv <- confint(model, level = 0.9)
print(conf_interv)
## 5 % 95 %
## (Intercept) 1.3047545 1.709841
## lcavol 0.6060482 0.832592
#Null hypoth: is that B1 = 0; and alt hypth: is that B1 != 0
#@ alpha = 0.1, 90% CI for b1 (Slope): 0.6060482 0.832592
#we can reject the null hypothesis since the CI we have does not include 0, meaning that there is significant evidence that lpsa is related to lcavol.
observation <- data.frame(lcavol = 2)
conf_interv2 <- predict(model, newdata = observation, interval = "confidence", level = 0.95)
pred_interv2 <- predict(model, newdata = observation, interval = "prediction", level = 0.95)
cat("95% CI for mean response µ0 at x0 = 2:\n", conf_interv2, "\n")
## 95% CI for mean response µ0 at x0 = 2:
## 2.945938 2.764442 3.127434
cat("\n95% PI for an individual response y0 at x0 = 2:\n", pred_interv2)
##
## 95% PI for an individual response y0 at x0 = 2:
## 2.945938 1.372054 4.519822
set.seed(123)
N <- 1000
n <- 50
b0_T <- 4
b1_T <- 6
s2_T <- 0.3
s_T <- sqrt(s2_T)
b0_hat <- numeric(N)
b1_hat <- numeric(N)
sig_hat <- numeric(N)
for (i in 1:N) {
x <- rnorm(n, mean = 0, sd = 1)
epsil <- rnorm(n, mean = 0, sd = s_T)
y <- b0_T + b1_T * x + epsil
model1 <- lm(y ~ x)
b0_hat[i] <- model1$coef[1]
b1_hat[i] <- model1$coef[2]
sig_hat[i] <- summary(model1)$sigma
}
cat("Avg estim of b0:", mean(b0_hat), "\n")
## Avg estim of b0: 4.004058
cat("Avg estim of b1:", mean(b1_hat), "\n")
## Avg estim of b1: 5.99825
cat("Avg estim of sig:", mean(sig_hat), "\n")
## Avg estim of sig: 0.5450496
hist(b0_hat, main = "Histogram of 1000 Estimates of b0",
xlab = expression(hat(beta)[0]),)
b0_var <- var(b0_hat)
cat("Sample variance of b0_hat:", b0_var)
## Sample variance of b0_hat: 0.00573333
hist(b1_hat, main = "Histogram of 1000 Estimates of b1",
xlab = expression(hat(beta)[1]),)
b1_var <- var(b1_hat)
cat("Sample variance of b1_var:", b1_var)
## Sample variance of b1_var: 0.006287197
hist(sig_hat, main = "Histogram of 1000 Estimates of sigma^2",
xlab = expression(hat(sigma)^2),)
ci_interv_length <- numeric(N)
z_crit <- qnorm(0.95)
for (i in 1:N) {
x <- rnorm(n, mean = 0, sd = 1)
eps <- rnorm(n, mean = 0, sd = s_T)
y <- b0_T + b1_T * x + eps
sum_of_sqrd_dev_x <- sum((x-mean(x))^2)
std_err_b1 <- s_T / sqrt(sum_of_sqrd_dev_x)
ci_interv_length[i] <- 2 * z_crit * std_err_b1
}
hist(ci_interv_length, main = "Histogram of 90% CI Lenths for B1 (sigma known)",
xlab = "Length of 90% CI for B1")
xbar_length <- mean(ci_interv_length)
cat("Average 90% CI Length for B1 (sigma known):", xbar_length)
## Average 90% CI Length for B1 (sigma known): 0.2606583
for (i in 1:N) {
x <- rnorm(n, mean = 0, sd = 1)
eps <- rnorm(n, mean = 0, sd = sqrt(s2_T))
y <- b0_T + b1_T * x + eps
model2 <- lm(y ~ x)
sum_of_sqrs_of_x <- sum((x - mean(x))^2)
t_crit <- qt(1 - alpha/2, df = n - 2)
se_b1 <- summary(model2)$sigma / sqrt(sum_of_sqrs_of_x)
ci_interv_length[i] <- 2 * t_crit * se_b1
}
hist(ci_interv_length, main = "Histogram of 90% CI Lengths for B1 (sigma unknown)",
xlab = "Length of 90% CI for B1")
xbar_length <- mean(ci_interv_length)
cat("Average 90% CI Length for B1 (sigma unknown):", xbar_length)
## Average 90% CI Length for B1 (sigma unknown): 0.2632748
n = 10
ci_interv_length <- numeric(N)
z_crit <- qnorm(0.95)
for (i in 1:N) {
x <- rnorm(n, mean = 0, sd = 1)
eps <- rnorm(n, mean = 0, sd = s_T)
y <- b0_T + b1_T * x + eps
sum_of_sqrd_dev_x <- sum((x-mean(x))^2)
std_err_b1 <- s_T / sqrt(sum_of_sqrd_dev_x)
ci_interv_length[i] <- 2 * z_crit * std_err_b1
}
hist(ci_interv_length, main = "Histogram of 90% CI Lenths for B1 (sigma known)",
xlab = "Length of 90% CI for B1")
xbar_length <- mean(ci_interv_length)
cat("Average 90% CI Length for B1 (sigma known):", xbar_length, "\n")
## Average 90% CI Length for B1 (sigma known): 0.6554896
for (i in 1:N) {
x <- rnorm(n, mean = 0, sd = 1)
eps <- rnorm(n, mean = 0, sd = sqrt(s2_T))
y <- b0_T + b1_T * x + eps
model2 <- lm(y ~ x)
sum_of_sqrs_of_x <- sum((x - mean(x))^2)
t_crit <- qt(1 - alpha/2, df = n - 2)
se_b1 <- summary(model2)$sigma / sqrt(sum_of_sqrs_of_x)
ci_interv_length[i] <- 2 * t_crit * se_b1
}
hist(ci_interv_length, main = "Histogram of 90% CI Lengths for B1 (sigma unknown)",
xlab = "Length of 90% CI for B1")
xbar_length <- mean(ci_interv_length)
cat("Average 90% CI Length for B1 (sigma unknown):", xbar_length)
## Average 90% CI Length for B1 (sigma unknown): 0.7204955
### Average 90% CI Length for B1 (sigma known): 0.6503338
### Average 90% CI Length for B1 (sigma unknown): 0.7071699
#### The smaller the sample size, the wider the CI gets, and in turn increasing uncertainty for estimating sigma^2, especially for when sigma^2 is unknown vs when it is known