library(ISLR2)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:ISLR2':
## 
##     Boston

Chapter 2 — Problem 9 (Auto Dataset)

(a) Quantitative vs Qualitative Predictors

data(Auto)
Auto <- na.omit(Auto)
str(Auto)

## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...

Quantitative: mpg, cylinders, displacement, horsepower, weight, acceleration, year

Qualitative: name, origin

(b) Range of Each Quantitative Predictor

quant_vars <- c("mpg","cylinders","displacement",
                "horsepower","weight","acceleration","year")

ranges <- sapply(Auto[, quant_vars], range)
rownames(ranges) <- c("Min", "Max")
print(ranges)

##      mpg cylinders displacement horsepower weight acceleration year
## Min  9.0         3           68         46   1613          8.0   70
## Max 46.6         8          455        230   5140         24.8   82

(c) Mean and Standard Deviation

means <- sapply(Auto[, quant_vars], mean)
sds   <- sapply(Auto[, quant_vars], sd)

summary_stats <- rbind(Mean = round(means, 2), SD = round(sds, 2))
print(summary_stats)

##        mpg cylinders displacement horsepower  weight acceleration  year
## Mean 23.45      5.47       194.41     104.47 2977.58        15.54 75.98
## SD    7.81      1.71       104.64      38.49  849.40         2.76  3.68

(d) Remove Observations 10–85 and Recompute

Auto_sub <- Auto[-(10:85), ]

ranges2 <- sapply(Auto_sub[, quant_vars], range)
rownames(ranges2) <- c("Min", "Max")

means2 <- sapply(Auto_sub[, quant_vars], mean)
sds2   <- sapply(Auto_sub[, quant_vars], sd)

cat("=== Range (subset) ===\n")

## === Range (subset) ===

print(ranges2)

##      mpg cylinders displacement horsepower weight acceleration year
## Min 11.0         3           68         46   1649          8.5   70
## Max 46.6         8          455        230   4997         24.8   82

cat("\n=== Mean (subset) ===\n")

## 
## === Mean (subset) ===

print(round(means2, 2))

##          mpg    cylinders displacement   horsepower       weight acceleration 
##        24.40         5.37       187.24       100.72      2935.97        15.73 
##         year 
##        77.15

cat("\n=== SD (subset) ===\n")

## 
## === SD (subset) ===

print(round(sds2, 2))

##          mpg    cylinders displacement   horsepower       weight acceleration 
##         7.87         1.65        99.68        35.71       811.30         2.69 
##         year 
##         3.11

(e) Graphical Investigation — Scatterplots

pairs(Auto[, quant_vars],
      main = "Scatterplot Matrix - Auto Dataset",
      pch  = 19,
      col  = "steelblue",
      cex  = 0.5)

Findings: There are strong negative relationships between mpg and displacement, horsepower, and weight. Heavier and more powerful cars tend to have lower fuel efficiency. year shows a positive relationship with mpg, suggesting cars became more fuel-efficient over time.

(f) Useful Predictors for mpg

par(mfrow = c(2, 3))

plot(Auto$displacement, Auto$mpg,
     xlab = "Displacement", ylab = "MPG",
     pch = 19, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ displacement, data = Auto), col = "red", lwd = 2)

plot(Auto$horsepower, Auto$mpg,
     xlab = "Horsepower", ylab = "MPG",
     pch = 19, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ horsepower, data = Auto), col = "red", lwd = 2)

plot(Auto$weight, Auto$mpg,
     xlab = "Weight", ylab = "MPG",
     pch = 19, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ weight, data = Auto), col = "red", lwd = 2)

plot(Auto$acceleration, Auto$mpg,
     xlab = "Acceleration", ylab = "MPG",
     pch = 19, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ acceleration, data = Auto), col = "red", lwd = 2)

plot(Auto$year, Auto$mpg,
     xlab = "Year", ylab = "MPG",
     pch = 19, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ year, data = Auto), col = "red", lwd = 2)

plot(Auto$cylinders, Auto$mpg,
     xlab = "Cylinders", ylab = "MPG",
     pch = 19, col = "steelblue", cex = 0.6)
abline(lm(mpg ~ cylinders, data = Auto), col = "red", lwd = 2)

par(mfrow = c(1,1))

Conclusion: displacement, horsepower, weight, and cylinders are the strongest predictors of mpg. year is also useful — newer cars are more fuel-efficient.

Chapter 3 — Problem 9 (Auto Dataset — Multiple Linear Regression)

(a) Scatterplot Matrix

pairs(Auto[, quant_vars],
      main = "Scatterplot Matrix - Auto Dataset",
      pch  = 19, col = "steelblue", cex = 0.5)

(b) Correlation Matrix

cor_matrix <- cor(Auto[, quant_vars])
round(cor_matrix, 3)

##                 mpg cylinders displacement horsepower weight acceleration
## mpg           1.000    -0.778       -0.805     -0.778 -0.832        0.423
## cylinders    -0.778     1.000        0.951      0.843  0.898       -0.505
## displacement -0.805     0.951        1.000      0.897  0.933       -0.544
## horsepower   -0.778     0.843        0.897      1.000  0.865       -0.689
## weight       -0.832     0.898        0.933      0.865  1.000       -0.417
## acceleration  0.423    -0.505       -0.544     -0.689 -0.417        1.000
## year          0.581    -0.346       -0.370     -0.416 -0.309        0.290
##                year
## mpg           0.581
## cylinders    -0.346
## displacement -0.370
## horsepower   -0.416
## weight       -0.309
## acceleration  0.290
## year          1.000

(c) Multiple Linear Regression: mpg ~ all except name

lm_auto <- lm(mpg ~ . - name, data = Auto)
summary(lm_auto)

## 
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16

i. Yes — the F-statistic is very large and p-value < 0.001, confirming a significant overall relationship between predictors and response.

ii. displacement, weight, year, and origin are statistically significant (p < 0.05).

iii. The positive coefficient for year (~0.75) suggests each additional year is associated with ~0.75 more mpg — cars became more fuel-efficient over time.

(d) Diagnostic Plots

par(mfrow = c(2, 2))
plot(lm_auto)

par(mfrow = c(1, 1))

Comments:

Residuals vs Fitted: Slight non-linear pattern — the linear model may not fully capture the relationship.
Q-Q Plot: Residuals approximately normal with a few outliers in the tails.
Scale-Location: Some heteroscedasticity at higher fitted values.
Leverage Plot: Observation 14 shows unusually high leverage.

(e) Interaction Effects

lm_interact <- lm(mpg ~ displacement * weight + year * origin, data = Auto)
summary(lm_interact)

## 
## Call:
## lm(formula = mpg ~ displacement * weight + year * origin, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5758 -1.6211 -0.0537  1.3264 13.3266 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.793e+01  8.044e+00   2.229 0.026394 *  
## displacement        -7.519e-02  9.091e-03  -8.271 2.19e-15 ***
## weight              -1.035e-02  6.450e-04 -16.053  < 2e-16 ***
## year                 4.864e-01  1.017e-01   4.782 2.47e-06 ***
## origin              -1.503e+01  4.232e+00  -3.551 0.000432 ***
## displacement:weight  2.098e-05  2.179e-06   9.625  < 2e-16 ***
## year:origin          1.980e-01  5.436e-02   3.642 0.000308 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.969 on 385 degrees of freedom
## Multiple R-squared:  0.8575, Adjusted R-squared:  0.8553 
## F-statistic: 386.2 on 6 and 385 DF,  p-value: < 2.2e-16

Finding: The interaction between displacement and weight is statistically significant, suggesting their combined effect on mpg is non-additive.

(f) Variable Transformations

par(mfrow = c(1, 3))

plot(log(Auto$horsepower), Auto$mpg,
     xlab = "log(horsepower)", ylab = "MPG",
     pch = 19, col = "steelblue", cex = 0.6,
     main = "log(X)")

plot(sqrt(Auto$horsepower), Auto$mpg,
     xlab = "sqrt(horsepower)", ylab = "MPG",
     pch = 19, col = "darkgreen", cex = 0.6,
     main = "sqrt(X)")

plot(Auto$horsepower^2, Auto$mpg,
     xlab = "horsepower^2", ylab = "MPG",
     pch = 19, col = "tomato", cex = 0.6,
     main = "X^2")

par(mfrow = c(1,1))

lm_log  <- lm(mpg ~ log(horsepower),  data = Auto)
lm_sqrt <- lm(mpg ~ sqrt(horsepower), data = Auto)
lm_sq   <- lm(mpg ~ I(horsepower^2), data = Auto)

cat("R² log(hp)  :", round(summary(lm_log)$r.squared,  4), "\n")

## R² log(hp)  : 0.6683

cat("R² sqrt(hp) :", round(summary(lm_sqrt)$r.squared, 4), "\n")

## R² sqrt(hp) : 0.6437

cat("R² hp²      :", round(summary(lm_sq)$r.squared,   4), "\n")

## R² hp²      : 0.5074

Finding: log(horsepower) gives the best linear fit (highest R²), confirming the non-linear relationship between horsepower and mpg.

Chapter 3 — Problem 15 (Boston Dataset)

(a) Simple Linear Regression for Each Predictor

data(Boston)
predictors <- setdiff(names(Boston), "crim")

results <- data.frame(
  Predictor   = predictors,
  Coefficient = NA,
  P_value     = NA
)

for (i in seq_along(predictors)) {
  fit <- lm(Boston$crim ~ Boston[[ predictors[i] ]])
  s   <- summary(fit)
  results$Coefficient[i] <- round(coef(fit)[2], 4)
  results$P_value[i]     <- round(s$coefficients[2, 4], 6)
}

results$Significant <- ifelse(results$P_value < 0.05, "Yes", "No")
print(results)

##    Predictor Coefficient  P_value Significant
## 1         zn     -0.0739 0.000006         Yes
## 2      indus      0.5098 0.000000         Yes
## 3       chas     -1.8928 0.209435          No
## 4        nox     31.2485 0.000000         Yes
## 5         rm     -2.6841 0.000001         Yes
## 6        age      0.1078 0.000000         Yes
## 7        dis     -1.5509 0.000000         Yes
## 8        rad      0.6179 0.000000         Yes
## 9        tax      0.0297 0.000000         Yes
## 10   ptratio      1.1520 0.000000         Yes
## 11     black     -0.0363 0.000000         Yes
## 12     lstat      0.5488 0.000000         Yes
## 13      medv     -0.3632 0.000000         Yes

par(mfrow = c(4, 4))
for (p in predictors) {
  plot(Boston[[p]], Boston$crim,
       xlab = p, ylab = "crim",
       pch = 19, col = "steelblue", cex = 0.4)
  abline(lm(Boston$crim ~ Boston[[p]]), col = "red", lwd = 1.5)
}
par(mfrow = c(1,1))

Finding: Almost all predictors have a statistically significant association with crim. rad and tax show the strongest positive associations.

(b) Multiple Regression

lm_boston <- lm(crim ~ ., data = Boston)
summary(lm_boston)

## 
## Call:
## lm(formula = crim ~ ., data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.924 -2.120 -0.353  1.019 75.051 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.033228   7.234903   2.354 0.018949 *  
## zn            0.044855   0.018734   2.394 0.017025 *  
## indus        -0.063855   0.083407  -0.766 0.444294    
## chas         -0.749134   1.180147  -0.635 0.525867    
## nox         -10.313535   5.275536  -1.955 0.051152 .  
## rm            0.430131   0.612830   0.702 0.483089    
## age           0.001452   0.017925   0.081 0.935488    
## dis          -0.987176   0.281817  -3.503 0.000502 ***
## rad           0.588209   0.088049   6.680 6.46e-11 ***
## tax          -0.003780   0.005156  -0.733 0.463793    
## ptratio      -0.271081   0.186450  -1.454 0.146611    
## black        -0.007538   0.003673  -2.052 0.040702 *  
## lstat         0.126211   0.075725   1.667 0.096208 .  
## medv         -0.198887   0.060516  -3.287 0.001087 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.439 on 492 degrees of freedom
## Multiple R-squared:  0.454,  Adjusted R-squared:  0.4396 
## F-statistic: 31.47 on 13 and 492 DF,  p-value: < 2.2e-16

Finding: Only zn, dis, rad, black, and medv remain statistically significant. Many predictors that were significant individually lose significance when controlling for others.

(c) Univariate vs Multiple Regression Coefficients

multi_coefs    <- coef(lm_boston)[-1]
uni_coefs      <- results$Coefficient
names(uni_coefs) <- predictors

common <- intersect(names(multi_coefs), names(uni_coefs))

plot(uni_coefs[common], multi_coefs[common],
     xlab = "Simple Regression Coefficients",
     ylab = "Multiple Regression Coefficients",
     pch  = 19, col = "steelblue",
     main = "Univariate vs Multiple Regression Coefficients")

abline(0, 1, col = "red", lty = 2)
text(uni_coefs[common], multi_coefs[common],
     labels = common, cex = 0.6, pos = 3)

Finding: Coefficients differ substantially between models, confirming multicollinearity among predictors.

(d) Non-linear Association (Polynomial Models)

# Exclude binary variable 'chas' — cannot fit poly degree 3 with only 2 values
poly_predictors <- setdiff(predictors, "chas")

poly_results <- data.frame(
  Predictor  = poly_predictors,
  Beta2_pval = NA,
  Beta3_pval = NA
)

for (i in seq_along(poly_predictors)) {
  p   <- poly_predictors[i]
  fit <- lm(crim ~ poly(Boston[[p]], 3), data = Boston)
  s   <- summary(fit)$coefficients
  poly_results$Beta2_pval[i] <- round(s[3, 4], 4)
  poly_results$Beta3_pval[i] <- round(s[4, 4], 4)
}

print(poly_results)

##    Predictor Beta2_pval Beta3_pval
## 1         zn     0.0044     0.2295
## 2      indus     0.0011     0.0000
## 3        nox     0.0001     0.0000
## 4         rm     0.0015     0.5086
## 5        age     0.0000     0.0067
## 6        dis     0.0000     0.0000
## 7        rad     0.0091     0.4823
## 8        tax     0.0000     0.2439
## 9    ptratio     0.0024     0.0063
## 10     black     0.4566     0.5436
## 11     lstat     0.0378     0.1299
## 12      medv     0.0000     0.0000

Finding: Several predictors (e.g., dis, nox, medv) show significant quadratic or cubic terms, confirming non-linear relationships with crime rate.

Chapter 4 — Problem 13 (Weekly Dataset)

(a) Numerical and Graphical Summaries

data(Weekly)
summary(Weekly)

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

pairs(Weekly[, -9],
      pch = 19,
      col = ifelse(Weekly$Direction == "Up", "steelblue", "tomato"),
      cex = 0.4,
      main = "Weekly Data - Blue=Up, Red=Down")

Pattern: Volume has increased steadily over time. The lag variables show very little correlation with Direction.

(b) Logistic Regression — Full Model

glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                data   = Weekly,
                family = binomial)
summary(glm_full)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Finding: Only Lag2 is statistically significant (p < 0.05).

(c) Confusion Matrix — Full Model

probs_full <- predict(glm_full, type = "response")
preds_full <- ifelse(probs_full > 0.5, "Up", "Down")

conf_matrix <- table(Predicted = preds_full, Actual = Weekly$Direction)
print(conf_matrix)

##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557

accuracy <- mean(preds_full == Weekly$Direction)
cat("\nOverall accuracy:", round(accuracy * 100, 2), "%\n")

## 
## Overall accuracy: 56.11 %

Interpretation: The model predicts “Up” most of the time, capturing the overall upward market bias but performing poorly at predicting “Down” weeks.

(d) Training 1990–2008 / Test 2009–2010 with Lag2 only

train <- Weekly$Year <= 2008
test  <- Weekly[!train, ]

glm_lag2 <- glm(Direction ~ Lag2,
                data   = Weekly,
                subset = train,
                family = binomial)

probs_test <- predict(glm_lag2, newdata = test, type = "response")
preds_test <- ifelse(probs_test > 0.5, "Up", "Down")

conf_test <- table(Predicted = preds_test, Actual = test$Direction)
print(conf_test)

##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56

accuracy_test <- mean(preds_test == test$Direction)
cat("\nTest accuracy (2009-2010):", round(accuracy_test * 100, 2), "%\n")

## 
## Test accuracy (2009-2010): 62.5 %

Finding: Using only Lag2 on the held-out test set gives better accuracy than the full model, confirming that a simpler model generalises better here.

Midterm

Kouassi Yoan Henoc

2026-04-15