1 Chapter 2 – Exercise 9 (Auto Dataset)

Make sure missing values have been removed.

data(Auto)
Auto <- na.omit(Auto)
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...

1.1 (a) Quantitative vs. Qualitative Predictors

# Quantitative: mpg, cylinders, displacement, horsepower, weight, acceleration, year
# Qualitative: name, origin (though origin is coded as integer, it is categorical)
cat("Quantitative predictors: mpg, cylinders, displacement, horsepower, weight, acceleration, year\n")
## Quantitative predictors: mpg, cylinders, displacement, horsepower, weight, acceleration, year
cat("Qualitative predictors: origin (categorical: 1=American, 2=European, 3=Japanese), name\n")
## Qualitative predictors: origin (categorical: 1=American, 2=European, 3=Japanese), name

1.2 (b) Range of Each Quantitative Predictor

quant_vars <- c("mpg", "cylinders", "displacement", "horsepower",
                "weight", "acceleration", "year")
ranges <- sapply(Auto[, quant_vars], range)
rownames(ranges) <- c("Min", "Max")
knitr::kable(ranges, caption = "Range of Quantitative Predictors")
Range of Quantitative Predictors
mpg cylinders displacement horsepower weight acceleration year
Min 9.0 3 68 46 1613 8.0 70
Max 46.6 8 455 230 5140 24.8 82

1.3 (c) Mean and Standard Deviation of Each Quantitative Predictor

means <- sapply(Auto[, quant_vars], mean)
sds   <- sapply(Auto[, quant_vars], sd)
summary_stats <- rbind(Mean = round(means, 2), SD = round(sds, 2))
knitr::kable(summary_stats, caption = "Mean and SD of Quantitative Predictors")
Mean and SD of Quantitative Predictors
mpg cylinders displacement horsepower weight acceleration year
Mean 23.45 5.47 194.41 104.47 2977.58 15.54 75.98
SD 7.81 1.71 104.64 38.49 849.40 2.76 3.68

1.4 (d) Remove Observations 10–85, then Recompute

Auto_sub <- Auto[-(10:85), ]
ranges_sub <- sapply(Auto_sub[, quant_vars], range)
rownames(ranges_sub) <- c("Min", "Max")
means_sub  <- sapply(Auto_sub[, quant_vars], mean)
sds_sub    <- sapply(Auto_sub[, quant_vars], sd)

knitr::kable(ranges_sub, caption = "Range after removing obs 10–85")
Range after removing obs 10–85
mpg cylinders displacement horsepower weight acceleration year
Min 11.0 3 68 46 1649 8.5 70
Max 46.6 8 455 230 4997 24.8 82
summary_sub <- rbind(Mean = round(means_sub, 2), SD = round(sds_sub, 2))
knitr::kable(summary_sub, caption = "Mean and SD after removing obs 10–85")
Mean and SD after removing obs 10–85
mpg cylinders displacement horsepower weight acceleration year
Mean 24.40 5.37 187.24 100.72 2935.97 15.73 77.15
SD 7.87 1.65 99.68 35.71 811.30 2.69 3.11

1.5 (e) Graphical Investigation of Predictors (Full Dataset)

ggpairs(Auto[, quant_vars],
        lower  = list(continuous = wrap("points", alpha = 0.3, size = 0.8)),
        diag   = list(continuous = wrap("densityDiag")),
        upper  = list(continuous = wrap("cor", size = 3))) +
  theme_bw(base_size = 9) +
  ggtitle("Scatterplot Matrix – Auto Dataset (Quantitative Variables)")

Findings: Strong negative correlations exist between mpg and displacement, horsepower, and weight. cylinders, displacement, horsepower, and weight are all highly positively correlated with each other. year shows a moderate positive correlation with mpg, suggesting fuel efficiency has improved over model years.

1.6 (f) Which Variables Are Useful for Predicting mpg?

par(mfrow = c(2, 3))
for (v in setdiff(quant_vars, "mpg")) {
  plot(Auto[[v]], Auto$mpg,
       xlab = v, ylab = "mpg",
       main = paste("mpg vs", v),
       pch  = 20, col = rgb(0.2, 0.4, 0.8, 0.4))
  abline(lm(mpg ~ Auto[[v]], data = Auto), col = "red", lwd = 2)
}

par(mfrow = c(1, 1))

Conclusion: displacement, horsepower, and weight show the strongest (negative) linear relationships with mpg and are the most useful predictors. year has a moderate positive relationship, indicating newer cars are more efficient. cylinders also tracks closely with engine size predictors.


2 Chapter 3 – Exercise 9 (Multiple Linear Regression on Auto)

2.1 (a) Scatterplot Matrix

pairs(Auto[, quant_vars],
      pch = 20, col = rgb(0.3, 0.5, 0.7, 0.3),
      main = "Scatterplot Matrix – Auto (All Quantitative Variables)")

2.2 (b) Correlation Matrix (Excluding name)

cor_mat <- cor(Auto[, quant_vars])
knitr::kable(round(cor_mat, 3), caption = "Correlation Matrix")
Correlation Matrix
mpg cylinders displacement horsepower weight acceleration year
mpg 1.000 -0.778 -0.805 -0.778 -0.832 0.423 0.581
cylinders -0.778 1.000 0.951 0.843 0.898 -0.505 -0.346
displacement -0.805 0.951 1.000 0.897 0.933 -0.544 -0.370
horsepower -0.778 0.843 0.897 1.000 0.865 -0.689 -0.416
weight -0.832 0.898 0.933 0.865 1.000 -0.417 -0.309
acceleration 0.423 -0.505 -0.544 -0.689 -0.417 1.000 0.290
year 0.581 -0.346 -0.370 -0.416 -0.309 0.290 1.000
corrplot(cor_mat, method = "color", type = "upper",
         addCoef.col = "black", tl.col = "black",
         number.cex = 0.7, title = "Correlation Matrix", mar = c(0,0,1,0))

2.3 (c) Multiple Linear Regression: mpg ~ all except name

lm_fit <- lm(mpg ~ . - name, data = Auto)
summary(lm_fit)
## 
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16

i. Relationship between predictors and response?
Yes – the F-statistic is very large with a near-zero p-value, indicating a statistically significant overall relationship.

ii. Which predictors are statistically significant?
displacement, weight, year, and origin all have p-values < 0.05 and appear statistically significant.

iii. What does the coefficient for year suggest?
The positive coefficient (~0.75) suggests that, holding other variables constant, each additional model year is associated with about 0.75 more miles per gallon – fuel efficiency has improved over time.

2.4 (d) Diagnostic Plots

par(mfrow = c(2, 2))
plot(lm_fit)

par(mfrow = c(1, 1))

Comments:

  • Residuals vs Fitted: There is a slight non-linear pattern (U-shape), suggesting the linear model may not fully capture the relationship.
  • Normal Q-Q: Residuals are mostly normal, with slight deviation in the tails.
  • Scale-Location: Some heteroscedasticity may be present (spread increases slightly at higher fitted values).
  • Leverage plot: Observation 14 has notably high leverage. A few points may be high-leverage but not necessarily influential outliers.

2.5 (e) Interaction Terms

lm_inter <- lm(mpg ~ displacement * weight + year * origin + horsepower * weight, data = Auto)
summary(lm_inter)
## 
## Call:
## lm(formula = mpg ~ displacement * weight + year * origin + horsepower * 
##     weight, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.4474 -1.6780 -0.0525  1.4634 11.8047 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.841e+01  7.914e+00   2.326  0.02056 *  
## displacement        -3.282e-02  1.798e-02  -1.825  0.06874 .  
## weight              -1.094e-02  6.996e-04 -15.641  < 2e-16 ***
## year                 5.288e-01  1.021e-01   5.177 3.65e-07 ***
## origin              -1.091e+01  4.319e+00  -2.525  0.01198 *  
## horsepower          -1.357e-01  4.204e-02  -3.229  0.00135 ** 
## displacement:weight  1.102e-05  5.187e-06   2.125  0.03421 *  
## year:origin          1.491e-01  5.528e-02   2.698  0.00729 ** 
## weight:horsepower    2.936e-05  1.216e-05   2.415  0.01622 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.899 on 383 degrees of freedom
## Multiple R-squared:  0.8648, Adjusted R-squared:  0.862 
## F-statistic: 306.3 on 8 and 383 DF,  p-value: < 2.2e-16

Finding: The interaction horsepower:weight is statistically significant (p < 0.05), suggesting the combined effect of engine power and vehicle weight on fuel economy is greater than each factor alone.

2.6 (f) Variable Transformations

par(mfrow = c(1, 3))
# log transformation
lm_log  <- lm(mpg ~ log(horsepower), data = Auto)
plot(log(Auto$horsepower), Auto$mpg,
     xlab = "log(horsepower)", ylab = "mpg", main = "mpg vs log(hp)", pch = 20,
     col = rgb(0.2, 0.4, 0.8, 0.5))
abline(lm_log, col = "red", lwd = 2)

# sqrt transformation
lm_sqrt <- lm(mpg ~ sqrt(horsepower), data = Auto)
plot(sqrt(Auto$horsepower), Auto$mpg,
     xlab = "sqrt(horsepower)", ylab = "mpg", main = "mpg vs sqrt(hp)", pch = 20,
     col = rgb(0.2, 0.7, 0.4, 0.5))
abline(lm_sqrt, col = "red", lwd = 2)

# quadratic transformation
lm_sq   <- lm(mpg ~ horsepower + I(horsepower^2), data = Auto)
hp_seq  <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out = 200)
pred_sq <- predict(lm_sq, newdata = data.frame(horsepower = hp_seq))
plot(Auto$horsepower, Auto$mpg,
     xlab = "horsepower", ylab = "mpg", main = "mpg vs hp + hp²", pch = 20,
     col = rgb(0.8, 0.3, 0.2, 0.5))
lines(hp_seq, pred_sq, col = "red", lwd = 2)

par(mfrow = c(1, 1))

Findings: The log(horsepower) and sqrt(horsepower) transformations linearize the relationship better than the raw variable. The quadratic fit also captures the non-linearity well, producing a lower residual standard error than the simple linear model.


3 Chapter 3 – Exercise 15 (Boston Dataset – Crime Rate)

data(Boston)
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7  5.21 28.7

3.1 (a) Simple Linear Regression for Each Predictor

predictors <- setdiff(names(Boston), "crim")

# Fit univariate models and collect results
uni_results <- lapply(predictors, function(p) {
  fit <- lm(as.formula(paste("crim ~", p)), data = Boston)
  s   <- summary(fit)
  data.frame(
    predictor = p,
    coef      = round(coef(fit)[2], 4),
    p_value   = round(s$coefficients[2, 4], 4),
    r_squared = round(s$r.squared, 4)
  )
})
uni_df <- do.call(rbind, uni_results)
uni_df$significant <- ifelse(uni_df$p_value < 0.05, "Yes", "No")
knitr::kable(uni_df, row.names = FALSE,
             caption = "Univariate Regression Results (response = crim)")
Univariate Regression Results (response = crim)
predictor coef p_value r_squared significant
zn -0.0739 0.0000 0.0402 Yes
indus 0.5098 0.0000 0.1653 Yes
chas -1.8928 0.2094 0.0031 No
nox 31.2485 0.0000 0.1772 Yes
rm -2.6841 0.0000 0.0481 Yes
age 0.1078 0.0000 0.1244 Yes
dis -1.5509 0.0000 0.1441 Yes
rad 0.6179 0.0000 0.3913 Yes
tax 0.0297 0.0000 0.3396 Yes
ptratio 1.1520 0.0000 0.0841 Yes
lstat 0.5488 0.0000 0.2076 Yes
medv -0.3632 0.0000 0.1508 Yes
par(mfrow = c(4, 4))
for (p in predictors) {
  fit <- lm(as.formula(paste("crim ~", p)), data = Boston)
  plot(Boston[[p]], Boston$crim,
       xlab = p, ylab = "crim", pch = 20,
       col  = rgb(0.3, 0.5, 0.8, 0.4), cex = 0.6)
  abline(fit, col = "red", lwd = 2)
}
par(mfrow = c(1, 1))

Finding: Most predictors show a statistically significant association with crim. Exceptions include chas (the Charles River dummy variable), which is not significant. Variables like rad (accessibility to radial highways) and tax show the strongest associations.

3.2 (b) Multiple Regression

lm_multi <- lm(crim ~ ., data = Boston)
summary(lm_multi)
## 
## Call:
## lm(formula = crim ~ ., data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.534 -2.248 -0.348  1.087 73.923 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.7783938  7.0818258   1.946 0.052271 .  
## zn           0.0457100  0.0187903   2.433 0.015344 *  
## indus       -0.0583501  0.0836351  -0.698 0.485709    
## chas        -0.8253776  1.1833963  -0.697 0.485841    
## nox         -9.9575865  5.2898242  -1.882 0.060370 .  
## rm           0.6289107  0.6070924   1.036 0.300738    
## age         -0.0008483  0.0179482  -0.047 0.962323    
## dis         -1.0122467  0.2824676  -3.584 0.000373 ***
## rad          0.6124653  0.0875358   6.997 8.59e-12 ***
## tax         -0.0037756  0.0051723  -0.730 0.465757    
## ptratio     -0.3040728  0.1863598  -1.632 0.103393    
## lstat        0.1388006  0.0757213   1.833 0.067398 .  
## medv        -0.2200564  0.0598240  -3.678 0.000261 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared:  0.4493, Adjusted R-squared:  0.4359 
## F-statistic: 33.52 on 12 and 493 DF,  p-value: < 2.2e-16

Finding: In the multiple regression, we can reject H₀: βⱼ = 0 for zn, dis, rad, black, and medv at α = 0.05. Many predictors that were significant in simple regression are no longer significant when controlling for others, suggesting multicollinearity.

3.3 (c) Univariate vs Multiple Regression Coefficients

multi_coefs <- coef(lm_multi)[-1]  # exclude intercept
uni_coefs   <- setNames(uni_df$coef, uni_df$predictor)

# Align by name
common <- intersect(names(uni_coefs), names(multi_coefs))
plot_df <- data.frame(
  univariate = uni_coefs[common],
  multiple   = multi_coefs[common],
  predictor  = common
)

ggplot(plot_df, aes(x = univariate, y = multiple, label = predictor)) +
  geom_point(color = "steelblue", size = 3) +
  geom_text(vjust = -0.6, size = 3) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  labs(title    = "Univariate vs Multiple Regression Coefficients",
       subtitle = "Response: per capita crime rate (crim)",
       x = "Simple Regression Coefficient",
       y = "Multiple Regression Coefficient") +
  theme_bw()

Observation: The coefficients differ substantially for several predictors (notably nox), indicating that simple regression conflates the effect of correlated predictors.

3.4 (d) Non-Linear Association (Polynomial Terms)

poly_results <- lapply(setdiff(predictors, "chas"), function(p) {
  fit3 <- lm(as.formula(paste("crim ~ poly(", p, ", 3)")), data = Boston)
  s    <- summary(fit3)
  coef_p <- s$coefficients
  sig2 <- ifelse(nrow(coef_p) >= 3 && coef_p[3, 4] < 0.05, "Yes", "No")
  sig3 <- ifelse(nrow(coef_p) >= 4 && coef_p[4, 4] < 0.05, "Yes", "No")
  data.frame(predictor = p,
             quad_sig  = sig2,
             cubic_sig = sig3)
})
poly_df <- do.call(rbind, poly_results)
knitr::kable(poly_df, row.names = FALSE,
             caption = "Evidence of Non-Linear Association (cubic polynomial)")
Evidence of Non-Linear Association (cubic polynomial)
predictor quad_sig cubic_sig
zn Yes No
indus Yes Yes
nox Yes Yes
rm Yes No
age Yes Yes
dis Yes Yes
rad Yes No
tax Yes No
ptratio Yes Yes
lstat Yes No
medv Yes Yes

Conclusion: Several predictors (including indus, nox, age, dis, ptratio, medv) show statistically significant quadratic or cubic terms, indicating non-linear relationships with crime rate.


4 Chapter 4 – Exercise 13 (Weekly Stock Market Data)

data(Weekly)
head(Weekly)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down
str(Weekly)
## 'data.frame':    1089 obs. of  9 variables:
##  $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
##  $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
##  $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
##  $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
##  $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
##  $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
##  $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...

4.1 (a) Numerical and Graphical Summaries

summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
par(mfrow = c(2, 3))
for (v in c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume")) {
  hist(Weekly[[v]], main = v, xlab = v,
       col = "steelblue", border = "white", breaks = 30)
}

par(mfrow = c(1, 1))
plot(Weekly$Year, Weekly$Volume,
     type = "l", col = "steelblue", lwd = 1.5,
     xlab = "Year", ylab = "Volume (avg shares traded, billions)",
     main = "Weekly Trading Volume Over Time")

Patterns: Trading volume has increased substantially over time, especially from the late 1990s through 2008. The lag variables show little obvious pattern on their own.

4.2 (b) Logistic Regression – Full Model

glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                data   = Weekly,
                family = binomial)
summary(glm_full)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Finding: Only Lag2 has a statistically significant p-value (< 0.05). The other lag variables and Volume are not significant in this model.

4.3 (c) Confusion Matrix – Full Model

prob_full  <- predict(glm_full, type = "response")
pred_full  <- ifelse(prob_full > 0.5, "Up", "Down")
conf_full  <- table(Predicted = pred_full, Actual = Weekly$Direction)
conf_full
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
# Overall fraction correct
acc_full <- mean(pred_full == Weekly$Direction)
cat("Overall accuracy (full model):", round(acc_full, 4), "\n")
## Overall accuracy (full model): 0.5611

Interpretation: The model correctly predicts about 56.1% of observations. However, the confusion matrix shows the model is biased toward predicting “Up” – it frequently misclassifies “Down” weeks. The high accuracy largely reflects the market’s upward bias over the sample period.

4.4 (d) Logistic Regression – Training 1990–2008, Test 2009–2010 (Lag2 only)

train <- Weekly$Year <= 2008
test  <- Weekly[!train, ]

glm_lag2 <- glm(Direction ~ Lag2,
                data   = Weekly,
                subset = train,
                family = binomial)
summary(glm_lag2)
## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly, 
##     subset = train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4
prob_lag2 <- predict(glm_lag2, newdata = test, type = "response")
pred_lag2 <- ifelse(prob_lag2 > 0.5, "Up", "Down")

conf_lag2 <- table(Predicted = pred_lag2, Actual = test$Direction)
conf_lag2
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
acc_lag2 <- mean(pred_lag2 == test$Direction)
cat("Test accuracy (Lag2 only, 2009–2010):", round(acc_lag2, 4), "\n")
## Test accuracy (Lag2 only, 2009–2010): 0.625

Finding: The model using only Lag2 achieves approximately 62.5% accuracy on held-out data from 2009–2010. This is better than random chance and better than the full model on the training data, suggesting Lag2 is a useful predictor.


End of Report

## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] corrplot_0.95 dplyr_1.1.4   GGally_2.4.0  ggplot2_4.0.0 ISLR2_1.3-2  
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.6.5        cli_3.6.5          knitr_1.50         rlang_1.1.6       
##  [5] xfun_0.52          purrr_1.1.0        generics_0.1.4     S7_0.2.0          
##  [9] jsonlite_2.0.0     labeling_0.4.3     glue_1.8.0         htmltools_0.5.8.1 
## [13] sass_0.4.10        scales_1.4.0       rmarkdown_2.29     ggstats_0.13.0    
## [17] grid_4.5.1         tibble_3.3.0       evaluate_1.0.5     jquerylib_0.1.4   
## [21] fastmap_1.2.0      yaml_2.3.10        lifecycle_1.0.4    compiler_4.5.1    
## [25] RColorBrewer_1.1-3 pkgconfig_2.0.3    tidyr_1.3.1        rstudioapi_0.17.1 
## [29] farver_2.1.2       digest_0.6.37      R6_2.6.1           tidyselect_1.2.1  
## [33] pillar_1.11.1      magrittr_2.0.4     bslib_0.9.0        withr_3.0.2       
## [37] tools_4.5.1        gtable_0.3.6       cachem_1.1.0