ps8_code.knit

Problem 1

Load the data

hybrid <- read.csv("hybrid_car.csv")
head(hybrid)

##           vehicle year     msrp accelerate   mpg
## 1 Prius (1st Gen) 1997 24509.74       7.46 41.26
## 2            Tino 2000 35354.97       8.20 54.10
## 3 Prius (2nd Gen) 2000 26832.25       7.97 45.23
## 4         Insight 2000 18936.41       9.52 53.00
## 5 Civic (1st Gen) 2001 25833.38       7.04 47.04
## 6         Insight 2001 19036.71       9.52 53.00

str(hybrid)

## 'data.frame':    153 obs. of  5 variables:
##  $ vehicle   : chr  "Prius (1st Gen)" "Tino" "Prius (2nd Gen)" "Insight" ...
##  $ year      : int  1997 2000 2000 2000 2001 2001 2002 2003 2003 2003 ...
##  $ msrp      : num  24510 35355 26832 18936 25833 ...
##  $ accelerate: num  7.46 8.2 7.97 9.52 7.04 9.52 9.71 8.33 9.52 8.62 ...
##  $ mpg       : num  41.3 54.1 45.2 53 47 ...

Part (a)

# Fit the regression: msrp ~ year + mpg + accelerate
fit_a <- lm(msrp ~ year + mpg + accelerate, data = hybrid)
summary(fit_a)

## 
## Call:
## lm(formula = msrp ~ year + mpg + accelerate, data = hybrid)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -40179  -8824  -2794   6800  48058 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 538047.5   748028.9   0.719 0.473091    
## year          -265.5      373.2  -0.712 0.477832    
## mpg           -470.6      127.3  -3.697 0.000306 ***
## accelerate    4291.3      501.6   8.554 1.33e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14850 on 149 degrees of freedom
## Multiple R-squared:  0.5289, Adjusted R-squared:  0.5194 
## F-statistic: 55.76 on 3 and 149 DF,  p-value: < 2.2e-16

# Residuals vs fitted values plot
plot(fitted(fit_a), resid(fit_a),
     xlab = "Fitted Values", ylab = "Residuals",
     main = "Residuals vs Fitted Values (Original Scale)",
     pch = 20, col = "steelblue")
abline(h = 0, lty = 2, col = "red")

Part (b)

# Create log-transformed variables
hybrid$lmsrp <- log(hybrid$msrp)
hybrid$lmpg  <- log(hybrid$mpg)

# Fit the regression: lmsrp ~ year + lmpg + accelerate
fit_b <- lm(lmsrp ~ year + lmpg + accelerate, data = hybrid)
summary(fit_b)

## 
## Call:
## lm(formula = lmsrp ~ year + lmpg + accelerate, data = hybrid)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07547 -0.21618 -0.00678  0.20964  0.92614 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.183371  16.639820   0.251    0.802    
## year         0.003439   0.008305   0.414    0.679    
## lmpg        -0.477847   0.098631  -4.845 3.15e-06 ***
## accelerate   0.086472   0.011163   7.747 1.35e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3303 on 149 degrees of freedom
## Multiple R-squared:  0.5454, Adjusted R-squared:  0.5363 
## F-statistic: 59.59 on 3 and 149 DF,  p-value: < 2.2e-16

Diagnostic plots

par(mfrow = c(2, 2))

# Residuals vs fitted
plot(fitted(fit_b), resid(fit_b),
     xlab = "Fitted Values", ylab = "Residuals",
     main = "Residuals vs Fitted (Log Scale)",
     pch = 20, col = "steelblue")
abline(h = 0, lty = 2, col = "red")

# Q-Q plot
qqnorm(resid(fit_b), pch = 20, col = "steelblue",
       main = "Normal Q-Q Plot of Residuals")
qqline(resid(fit_b), col = "red")

# Scale-Location plot
plot(fitted(fit_b), sqrt(abs(resid(fit_b))),
     xlab = "Fitted Values", ylab = expression(sqrt("|Residuals|")),
     main = "Scale-Location Plot",
     pch = 20, col = "steelblue")

# Residuals vs each predictor
plot(hybrid$accelerate, resid(fit_b),
     xlab = "Accelerate", ylab = "Residuals",
     main = "Residuals vs Accelerate",
     pch = 20, col = "steelblue")
abline(h = 0, lty = 2, col = "red")

par(mfrow = c(1, 1))

# Shapiro-Wilk test for normality
shapiro.test(resid(fit_b))

## 
##  Shapiro-Wilk normality test
## 
## data:  resid(fit_b)
## W = 0.99205, p-value = 0.5547

Part (c)

coef(fit_b)

##  (Intercept)         year         lmpg   accelerate 
##  4.183371010  0.003438883 -0.477847308  0.086471933

Part (d)

B <- 10000
n <- nrow(hybrid)
boot_lmpg <- numeric(B)

for (i in 1:B) {
  idx <- sample(1:n, n, replace = TRUE)
  boot_data <- hybrid[idx, ]
  boot_fit <- lm(lmsrp ~ year + lmpg + accelerate, data = boot_data)
  boot_lmpg[i] <- coef(boot_fit)["lmpg"]
}

# 95% bootstrap CI (percentile method)
ci <- quantile(boot_lmpg, c(0.025, 0.975))
cat("95% Bootstrap CI for beta_lmpg:", ci, "\n")

## 95% Bootstrap CI for beta_lmpg: -0.6970528 -0.2757792

# Histogram of bootstrap distribution
hist(boot_lmpg, breaks = 50, col = "lightblue", border = "white",
     main = "Bootstrap Distribution of lmpg Coefficient",
     xlab = expression(hat(beta)[lmpg]))
abline(v = ci, col = "red", lty = 2, lwd = 2)
abline(v = coef(fit_b)["lmpg"], col = "blue", lwd = 2)
legend("topright", c("Original estimate", "95% CI bounds"),
       col = c("blue", "red"), lty = c(1, 2), lwd = 2)

95% Bootstrap CI for beta_lmpg: -0.6934129 -0.2745603

Part (e)

beta_accel <- coef(fit_b)["accelerate"]
ratio <- exp(2 * beta_accel)
pct_diff <- (ratio - 1) * 100
cat("exp(2 * beta_accelerate) =", ratio, "\n")

## exp(2 * beta_accelerate) = 1.188799

cat("Predicted percentage difference:", round(pct_diff, 2), "%\n")

## Predicted percentage difference: 18.88 %

Part (g)

# New observation
new_data <- data.frame(year = 2009, accelerate = 10, lmpg = log(40))

# Prediction interval on log scale
pred_log <- predict(fit_b, newdata = new_data,
                    interval = "prediction", level = 0.90)
cat("90% PI for log(msrp):", pred_log[, "lwr"], "to", pred_log[, "upr"], "\n")

## 90% PI for log(msrp): 9.644599 to 10.74357

# Transform to msrp scale
pred_msrp <- exp(pred_log)
cat("90% PI for msrp: $", round(pred_msrp[, "lwr"], 2),
    "to $", round(pred_msrp[, "upr"], 2), "\n")

## 90% PI for msrp: $ 15438.18 to $ 46331.16