Problem 1
Load the data
hybrid <- read.csv("hybrid_car.csv")
head(hybrid)
## vehicle year msrp accelerate mpg
## 1 Prius (1st Gen) 1997 24509.74 7.46 41.26
## 2 Tino 2000 35354.97 8.20 54.10
## 3 Prius (2nd Gen) 2000 26832.25 7.97 45.23
## 4 Insight 2000 18936.41 9.52 53.00
## 5 Civic (1st Gen) 2001 25833.38 7.04 47.04
## 6 Insight 2001 19036.71 9.52 53.00
str(hybrid)
## 'data.frame': 153 obs. of 5 variables:
## $ vehicle : chr "Prius (1st Gen)" "Tino" "Prius (2nd Gen)" "Insight" ...
## $ year : int 1997 2000 2000 2000 2001 2001 2002 2003 2003 2003 ...
## $ msrp : num 24510 35355 26832 18936 25833 ...
## $ accelerate: num 7.46 8.2 7.97 9.52 7.04 9.52 9.71 8.33 9.52 8.62 ...
## $ mpg : num 41.3 54.1 45.2 53 47 ...
Part (a)
# Fit the regression: msrp ~ year + mpg + accelerate
fit_a <- lm(msrp ~ year + mpg + accelerate, data = hybrid)
summary(fit_a)
##
## Call:
## lm(formula = msrp ~ year + mpg + accelerate, data = hybrid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40179 -8824 -2794 6800 48058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 538047.5 748028.9 0.719 0.473091
## year -265.5 373.2 -0.712 0.477832
## mpg -470.6 127.3 -3.697 0.000306 ***
## accelerate 4291.3 501.6 8.554 1.33e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14850 on 149 degrees of freedom
## Multiple R-squared: 0.5289, Adjusted R-squared: 0.5194
## F-statistic: 55.76 on 3 and 149 DF, p-value: < 2.2e-16
# Residuals vs fitted values plot
plot(fitted(fit_a), resid(fit_a),
xlab = "Fitted Values", ylab = "Residuals",
main = "Residuals vs Fitted Values (Original Scale)",
pch = 20, col = "steelblue")
abline(h = 0, lty = 2, col = "red")
Part (b)
# Create log-transformed variables
hybrid$lmsrp <- log(hybrid$msrp)
hybrid$lmpg <- log(hybrid$mpg)
# Fit the regression: lmsrp ~ year + lmpg + accelerate
fit_b <- lm(lmsrp ~ year + lmpg + accelerate, data = hybrid)
summary(fit_b)
##
## Call:
## lm(formula = lmsrp ~ year + lmpg + accelerate, data = hybrid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07547 -0.21618 -0.00678 0.20964 0.92614
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.183371 16.639820 0.251 0.802
## year 0.003439 0.008305 0.414 0.679
## lmpg -0.477847 0.098631 -4.845 3.15e-06 ***
## accelerate 0.086472 0.011163 7.747 1.35e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3303 on 149 degrees of freedom
## Multiple R-squared: 0.5454, Adjusted R-squared: 0.5363
## F-statistic: 59.59 on 3 and 149 DF, p-value: < 2.2e-16
Diagnostic plots
par(mfrow = c(2, 2))
# Residuals vs fitted
plot(fitted(fit_b), resid(fit_b),
xlab = "Fitted Values", ylab = "Residuals",
main = "Residuals vs Fitted (Log Scale)",
pch = 20, col = "steelblue")
abline(h = 0, lty = 2, col = "red")
# Q-Q plot
qqnorm(resid(fit_b), pch = 20, col = "steelblue",
main = "Normal Q-Q Plot of Residuals")
qqline(resid(fit_b), col = "red")
# Scale-Location plot
plot(fitted(fit_b), sqrt(abs(resid(fit_b))),
xlab = "Fitted Values", ylab = expression(sqrt("|Residuals|")),
main = "Scale-Location Plot",
pch = 20, col = "steelblue")
# Residuals vs each predictor
plot(hybrid$accelerate, resid(fit_b),
xlab = "Accelerate", ylab = "Residuals",
main = "Residuals vs Accelerate",
pch = 20, col = "steelblue")
abline(h = 0, lty = 2, col = "red")
par(mfrow = c(1, 1))
# Shapiro-Wilk test for normality
shapiro.test(resid(fit_b))
##
## Shapiro-Wilk normality test
##
## data: resid(fit_b)
## W = 0.99205, p-value = 0.5547
Part (c)
coef(fit_b)
## (Intercept) year lmpg accelerate
## 4.183371010 0.003438883 -0.477847308 0.086471933
Part (d)
B <- 10000
n <- nrow(hybrid)
boot_lmpg <- numeric(B)
for (i in 1:B) {
idx <- sample(1:n, n, replace = TRUE)
boot_data <- hybrid[idx, ]
boot_fit <- lm(lmsrp ~ year + lmpg + accelerate, data = boot_data)
boot_lmpg[i] <- coef(boot_fit)["lmpg"]
}
# 95% bootstrap CI (percentile method)
ci <- quantile(boot_lmpg, c(0.025, 0.975))
cat("95% Bootstrap CI for beta_lmpg:", ci, "\n")
## 95% Bootstrap CI for beta_lmpg: -0.6970528 -0.2757792
# Histogram of bootstrap distribution
hist(boot_lmpg, breaks = 50, col = "lightblue", border = "white",
main = "Bootstrap Distribution of lmpg Coefficient",
xlab = expression(hat(beta)[lmpg]))
abline(v = ci, col = "red", lty = 2, lwd = 2)
abline(v = coef(fit_b)["lmpg"], col = "blue", lwd = 2)
legend("topright", c("Original estimate", "95% CI bounds"),
col = c("blue", "red"), lty = c(1, 2), lwd = 2)
95% Bootstrap CI for beta_lmpg: -0.6934129 -0.2745603
Part (e)
beta_accel <- coef(fit_b)["accelerate"]
ratio <- exp(2 * beta_accel)
pct_diff <- (ratio - 1) * 100
cat("exp(2 * beta_accelerate) =", ratio, "\n")
## exp(2 * beta_accelerate) = 1.188799
cat("Predicted percentage difference:", round(pct_diff, 2), "%\n")
## Predicted percentage difference: 18.88 %
Part (g)
# New observation
new_data <- data.frame(year = 2009, accelerate = 10, lmpg = log(40))
# Prediction interval on log scale
pred_log <- predict(fit_b, newdata = new_data,
interval = "prediction", level = 0.90)
cat("90% PI for log(msrp):", pred_log[, "lwr"], "to", pred_log[, "upr"], "\n")
## 90% PI for log(msrp): 9.644599 to 10.74357
# Transform to msrp scale
pred_msrp <- exp(pred_log)
cat("90% PI for msrp: $", round(pred_msrp[, "lwr"], 2),
"to $", round(pred_msrp[, "upr"], 2), "\n")
## 90% PI for msrp: $ 15438.18 to $ 46331.16