This problem uses the dataset prostate. Please install the book package faraway and run the following code to load the data:
library(faraway)
data(prostate)
head(prostate)
plot(prostate$lpsa ~ prostate$lcavol)
mean(prostate$lpsa)
[1] 2.478387
mean(prostate$lcavol)
[1] 1.35001
sd(prostate$lpsa)
[1] 1.154329
sd(prostate$lcavol)
[1] 1.178625
cor(prostate$lpsa, prostate$lcavol)
[1] 0.7344603
lmodel <- lm(lpsa ~ lcavol, data = prostate)
summary(lmodel)
Call:
lm(formula = lpsa ~ lcavol, data = prostate)
Residuals:
     Min       1Q   Median       3Q      Max
-1.67625 -0.41648  0.09859  0.50709  1.89673
Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  1.50730    0.12194   12.36   <2e-16 ***
lcavol       0.71932    0.06819   10.55   <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7875 on 95 degrees of freedom
Multiple R-squared: 0.5394, Adjusted R-squared: 0.5346
F-statistic: 111.3 on 1 and 95 DF, p-value: < 2.2e-16
plot(lpsa ~ lcavol, data = prostate,
main = "Predicting LPSA based on LCAVOL",
xlab = "lcavol", ylab = "lpsa"
)
abline(lmodel)
The estimated β̂0 is 1.5073 and β̂1 = 0.71932, and the estimated variance of β̂0 is se(β̂0)² = 0.12194².
(5 pts) The intercept, β̂0, is the expected lpsa when lcavol = 0. The slope, β̂1, is the expected change in lpsa for a one-unit increase in lcavol.
The R² is 0.5394, so lcavol explains about 54% of the variation in lpsa.
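As a quick check (my addition, not part of the required output), these numbers can be pulled straight from the fitted model; in simple linear regression R² is just the squared sample correlation:
coef(lmodel) # (Intercept) 1.50730, lcavol 0.71932
summary(lmodel)$r.squared # 0.5394
cor(prostate$lpsa, prostate$lcavol)^2 # 0.7344603^2 = 0.5394, the same value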
This is the fitted regression equation: \[\hat{y} = \widehat{\beta_0} + \widehat{\beta_1}x\] The expected value of lpsa when lcavol = \(\bar{x}\) (the sample mean) is \[\hat{y}(\bar{x}) = \widehat{\beta_0} + \widehat{\beta_1}\bar{x}\] and the expected value of lpsa when lcavol is one standard deviation below the mean is \[\hat{y}(\bar{x}-s_x) = \widehat{\beta_0} + \widehat{\beta_1}(\bar{x}-s_x)\] The difference is: \[\left[\widehat{\beta_0} + \widehat{\beta_1}\bar{x}\right] - \left[\widehat{\beta_0} + \widehat{\beta_1}(\bar{x}-s_x)\right]\] which simplifies to:
\[\widehat{\beta_1}s_x\] This means lpsa is expected to be smaller by this amount when lcavol is one standard deviation below the mean. Since we need to express the result in units of \(s_y\), divide by \(s_y\):
\[\frac{\widehat{\beta_1}s_x}{s_y}\] (Since \(\widehat{\beta_1} = r\,s_y/s_x\) in simple regression, this ratio is exactly the sample correlation \(r\).)
beta1_hat <- 0.71932
sx <- sd(prostate$lcavol)
sy <- sd(prostate$lpsa)
# Compute expected change in lpsa (in units of sy)
expected_decrease <- (beta1_hat * sx) / sy
expected_decrease
[1] 0.7344601
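This matches the sample correlation (0.7344603) up to the rounding in beta1_hat; recomputing with the full-precision coefficient makes the agreement exact:
coef(lmodel)[2] * sx / sy # identical to the correlation, to machine precision
cor(prostate$lpsa, prostate$lcavol) # 0.7344603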
From my notes, the 90% confidence interval is calculated like this: \[\left[\widehat{\beta_0} - 1.645\sqrt{\operatorname{var}(\widehat{\beta_0})},\ \widehat{\beta_0} + 1.645\sqrt{\operatorname{var}(\widehat{\beta_0})}\right]\]
sum_model <- summary(lmodel)
sum_model
Call:
lm(formula = lpsa ~ lcavol, data = prostate)
Residuals:
     Min       1Q   Median       3Q      Max
-1.67625 -0.41648  0.09859  0.50709  1.89673
Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  1.50730    0.12194   12.36   <2e-16 ***
lcavol       0.71932    0.06819   10.55   <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7875 on 95 degrees of freedom
Multiple R-squared: 0.5394, Adjusted R-squared: 0.5346
F-statistic: 111.3 on 1 and 95 DF, p-value: < 2.2e-16
beta_0_hat <- coef(lmodel)[1] # 1.50730
# The variance is the standard error squared; since we take the square root of the variance, it is easier to use the standard error directly.
sqrt_var_beta0_hat <- (sum_model$coefficients[1,2]) # this gets the standard error for the intercept (0.12194)
start_value <- beta_0_hat - 1.645 * (sqrt_var_beta0_hat)
end_value <- beta_0_hat + 1.645 * (sqrt_var_beta0_hat)
message("90% confidence interval for beta 0 is: [", start_value, " , ",end_value,"]" )
90% confidence interval for beta 0 is: [1.30671185659567 , 1.70788400716243]
#now for beta_1
beta1_hat <- coef(lmodel)[2] #0.71932
sqrt_var_beta1_hat <- sum_model$coefficients[2,2] #0.06819
start_value_b1 <- beta1_hat - 1.645 * (sqrt_var_beta1_hat)
end_value_b1 <- beta1_hat + 1.645 * (sqrt_var_beta1_hat)
message("90% confidence interval for beta 1 is: [", start_value_b1, " , ",end_value_b1,"]" )
90% confidence interval for beta 1 is: [0.607142853744293 , 0.831497424460505]
confint(lmodel, level = 0.90)
                  5 %     95 %
(Intercept) 1.3047545 1.709841
lcavol      0.6060482 0.832592
The values are very close but not identical. The difference is not just rounding: confint() uses the t critical value qt(0.95, df = 95) ≈ 1.661 rather than the normal value 1.645 that I used, which makes its intervals slightly wider.
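Recomputing with the t critical value reproduces confint() exactly (a quick check, my addition):
t_crit <- qt(0.95, df = df.residual(lmodel)) # qt(0.95, 95) ≈ 1.6611
beta_0_hat + c(-1, 1) * t_crit * sqrt_var_beta0_hat # matches the (Intercept) row of confint()
beta1_hat + c(-1, 1) * t_crit * sqrt_var_beta1_hat # matches the lcavol row of confint()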
x0 <- 2
#create dataframe with new value
new_data <- data.frame(lcavol = x0)
# Prediction for mu0 (mean of lpsa)
lpsa_mean_prediction <- predict(lmodel, new_data)
print(paste("Prediction for μ0:", lpsa_mean_prediction))
[1] "Prediction for μ0: 2.94593821008385"
# 95% confidence interval for μ0
confidence_interval <- predict(lmodel, new_data, interval = "confidence")
print("95% Confidence Interval for μ0:")
[1] "95% Confidence Interval for μ0:"
print(confidence_interval)
       fit      lwr      upr
1 2.945938 2.764442 3.127434
# 95% prediction interval for y0 (individual lpsa)
prediction_interval <- predict(lmodel, new_data, interval = "prediction")
print("95% Prediction Interval for y0:")
[1] "95% Prediction Interval for y0:"
print(prediction_interval)
       fit      lwr      upr
1 2.945938 1.372054 4.519822
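The two intervals share the same point estimate but use different standard errors: the confidence interval for μ0 uses σ̂·sqrt(1/n + (x0 − x̄)²/Sxx), while the prediction interval for an individual y0 adds the error variance, σ̂·sqrt(1 + 1/n + (x0 − x̄)²/Sxx), where Sxx = Σ(xi − x̄)². A minimal sketch (my addition) reproducing predict()'s output from these formulas:
s_hat <- summary(lmodel)$sigma # residual standard error, 0.7875
n_obs <- nrow(prostate) # 97
Sxx <- sum((prostate$lcavol - mean(prostate$lcavol))^2)
se_mu0 <- s_hat * sqrt(1/n_obs + (x0 - mean(prostate$lcavol))^2 / Sxx)
se_y0 <- s_hat * sqrt(1 + 1/n_obs + (x0 - mean(prostate$lcavol))^2 / Sxx)
t_crit <- qt(0.975, df = n_obs - 2) # two-sided 95%
lpsa_mean_prediction + c(-1, 1) * t_crit * se_mu0 # matches the confidence interval
lpsa_mean_prediction + c(-1, 1) * t_crit * se_y0 # matches the prediction interval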
Suppose β0 = 4, β1 = 6, σ² = 0.3, and n = 50. We also assume that the xi's are independent and identically distributed as N(0, 1). Set the seed with set.seed(123) for the experiment and repeat the following N = 1000 times:
• (i) Generate the data {(x1, y1), . . . , (x50, y50)} from the model. [Note. Use the function rnorm() to generate the xi's and εi's with the corresponding means and standard deviations.]
• (ii) Use the lm() function to get β̂0, β̂1, and σ̂². [Note. Suppose you get the model object out after calling the lm() function; you can get (β̂0, β̂1) using out$coef. If you define out.s = summary(out), you can get σ̂ using out.s$sigma.]
set.seed(123)
N <- 1000
n <- 50
beta0 <- 4
beta1 <- 6
sigma2 <- 0.3
# vectors of length N to store the estimates from each of the N simulations
beta0_hat <- numeric(N)
beta1_hat <- numeric(N)
sigma2_hat <- numeric(N)
# doing 1000 simulations
for (i in 1:N) {
x <- rnorm(n, mean = 0, sd = 1)
epsilon <- rnorm(n, mean = 0, sd = sqrt(sigma2))
y <- beta0 + beta1 * x + epsilon
out <- lm(y ~ x)
beta0_hat[i] <- out$coef[1]
beta1_hat[i] <- out$coef[2]
out.s <- summary(out)
sigma2_hat[i] <- out.s$sigma^2
}
hist(beta0_hat, main = "Histogram of β0 hat", xlab = "β0 hat")
var(beta0_hat) # 0.00573333
[1] 0.00573333
hist(beta1_hat, main = "Histogram of β1 hat", xlab = "β1 hat")
var(beta1_hat) #0.006287197
[1] 0.006287197
hist(sigma2_hat, main = "Histogram of sigma^2 hat", xlab = "sigma^2 hat")
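As a sanity check (my addition, not required by the problem), the simulated sampling distributions can be compared with theory: σ̂² is unbiased for σ² = 0.3, and with xi ~ N(0, 1) the variance of β̂0 is approximately σ²/n = 0.006, close to the simulated values above:
mean(beta0_hat) # should be close to the true beta0 = 4
mean(beta1_hat) # should be close to the true beta1 = 6
mean(sigma2_hat) # should be close to sigma^2 = 0.3 (sigma2_hat is unbiased)
sigma2 / n # 0.006, approximately the theoretical variance of beta0_hat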
#to store the confidence interval lengths in a vector of size N
ci_lengths_known <- numeric(N)
for (i in 1:N) {
# this will generate the data, like before
x <- rnorm(n, mean = 0, sd = 1)
epsilon <- rnorm(n, mean = 0, sd = sqrt(sigma2))
y <- beta0 + beta1 * x + epsilon
model <- lm(y ~ x)
x_var <- sum((x - mean(x))^2) # Sxx, the sum of squared deviations
# CI length with known sigma^2: 2 * 1.645 * sqrt(sigma^2 / Sxx)
ci_length <- 2 * 1.645 * sqrt(sigma2 / x_var)
ci_lengths_known[i] <- ci_length
}
hist(ci_lengths_known, main = "Histogram of 90% CI Lengths for β̂1 (Known σ²)",
xlab = "CI Length")
#average CI length
mean(ci_lengths_known)
[1] 0.2606815
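For reference (my addition), since E[Sxx] = n − 1 = 49, a rough theoretical length is 2·1.645·sqrt(0.3/49) ≈ 0.257; the simulated average is slightly larger because E[sqrt(1/Sxx)] > sqrt(1/E[Sxx]) (Jensen's inequality):
2 * qnorm(0.95) * sqrt(sigma2 / (n - 1)) # ≈ 0.257, a rough theoretical benchmark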
# Since σ² is unknown, I will use the t-distribution instead of the normal distribution
t_val <- qt(0.95, df = n - 2) # t-critical value for 90% CI
ci_lengths_unknown <- numeric(N)
#this follows what I did above
for (i in 1:N) {
x <- rnorm(n, mean = 0, sd = 1)
epsilon <- rnorm(n, mean = 0, sd = sqrt(sigma2))
y <- beta0 + beta1 * x + epsilon
model <- lm(y ~ x)
sum_model <- summary(model)
sigma_hat2 <- sum_model$sigma^2
x_var <- sum((x - mean(x))^2)
ci_length <- 2 * t_val * sqrt(sigma_hat2 / x_var)
ci_lengths_unknown[i] <- ci_length
}
# Histogram of CI lengths
hist(ci_lengths_unknown, main = "Histogram of 90% CI Lengths for β̂1 (Unknown σ²)",
xlab = "CI Length")
# average CI length
mean(ci_lengths_unknown)
[1] 0.2632748
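The unknown-σ² intervals are on average about 1% longer; most of this comes from the t critical value exceeding the normal value 1.645, slightly offset by σ̂ underestimating σ on average (my addition):
qt(0.95, df = n - 2) / qnorm(0.95) # ≈ 1.02: the t critical value is about 2% larger
mean(ci_lengths_unknown) / mean(ci_lengths_known) # ≈ 1.01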
n <- 10 #the new sample size
ci_lengths_known_10 <- numeric(N)
ci_lengths_unknown_10 <- numeric(N)
t_val_10 <- qt(0.95, df = n - 2)
for (i in 1:N) {
x <- rnorm(n, mean = 0, sd = 1)
epsilon <- rnorm(n, mean = 0, sd = sqrt(sigma2))
y <- beta0 + beta1 * x + epsilon
model <- lm(y ~ x)
sum_model <- summary(model)
sigma_hat <- sum_model$sigma
x_var <- sum((x - mean(x))^2)
ci_lengths_known_10[i] <- 2 * 1.645 * sqrt(sigma2 / x_var) # using the normal critical value 1.645 (sigma^2 known)
ci_lengths_unknown_10[i] <- 2 * t_val_10 * sqrt(sigma_hat^2 / x_var) # using the t critical value (sigma^2 estimated)
}
hist(ci_lengths_known_10, main = "Histogram of CI Lengths (n=10, Known σ²)")
mean(ci_lengths_known_10)
[1] 0.6555479
hist(ci_lengths_unknown_10, main = "Histogram of CI Lengths (n=10, Unknown σ²)")
mean(ci_lengths_unknown_10)
[1] 0.7253124
When the sample size is 10, the confidence intervals are on average about 2.5 times longer than when the sample size is 50 (0.656 vs. 0.261), because Sxx = Σ(xi − x̄)² is much smaller in small samples, so the slope is estimated with more variance; the rough scaling is sqrt(49/9) ≈ 2.3. When σ² is unknown, the confidence interval lengths are somewhat longer for two reasons: the t critical value (qt(0.95, 8) ≈ 1.86 for n = 10) exceeds the normal value 1.645, and each simulation gives a slightly different value of σ̂², which adds estimation uncertainty.
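A quick numerical check of these claims (my addition):
mean(ci_lengths_known_10) / mean(ci_lengths_known) # ≈ 2.5
sqrt((50 - 1) / (10 - 1)) # ≈ 2.33, the rough scaling from E[Sxx] = n - 1
qt(0.95, df = 10 - 2) / qnorm(0.95) # ≈ 1.13: the t critical value grows at n = 10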