Chapter 2 – Exercise 9

Data Preparation

data(Auto)
Auto <- na.omit(Auto)
str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...

(a) Quantitative vs. Qualitative Predictors

The quantitative predictors are: mpg, cylinders, displacement, horsepower, weight, acceleration, year.

The qualitative predictor is: name (and origin can be treated as qualitative/categorical since it codes 1 = American, 2 = European, 3 = Japanese).

# Confirm variable types
sapply(Auto, class)
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    "numeric"    "integer"    "numeric"    "integer"    "integer"    "numeric" 
##         year       origin         name 
##    "integer"    "integer"     "factor"

(b) Range of Each Quantitative Predictor

quant_vars <- Auto %>% select(mpg, cylinders, displacement, horsepower, weight, acceleration, year)
sapply(quant_vars, range)
##       mpg cylinders displacement horsepower weight acceleration year
## [1,]  9.0         3           68         46   1613          8.0   70
## [2,] 46.6         8          455        230   5140         24.8   82

(c) Mean and Standard Deviation of Each Quantitative Predictor

sapply(quant_vars, mean)
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    23.445918     5.471939   194.411990   104.469388  2977.584184    15.541327 
##         year 
##    75.979592
sapply(quant_vars, sd)
##          mpg    cylinders displacement   horsepower       weight acceleration 
##     7.805007     1.705783   104.644004    38.491160   849.402560     2.758864 
##         year 
##     3.683737

(d) Subset: Remove Observations 10–85

Auto_sub <- Auto[-(10:85), ]
quant_sub <- Auto_sub %>% select(mpg, cylinders, displacement, horsepower, weight, acceleration, year)

cat("Range:\n"); print(sapply(quant_sub, range))
## Range:
##       mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0         3           68         46   1649          8.5   70
## [2,] 46.6         8          455        230   4997         24.8   82
cat("\nMean:\n");  print(sapply(quant_sub, mean))
## 
## Mean:
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    24.404430     5.373418   187.240506   100.721519  2935.971519    15.726899 
##         year 
##    77.145570
cat("\nSD:\n");    print(sapply(quant_sub, sd))
## 
## SD:
##          mpg    cylinders displacement   horsepower       weight acceleration 
##     7.867283     1.654179    99.678367    35.708853   811.300208     2.693721 
##         year 
##     3.106217

(e) Graphical Investigation of Predictors

ggpairs(quant_vars,
        lower  = list(continuous = wrap("points", alpha = 0.3, size = 0.8)),
        diag   = list(continuous = wrap("densityDiag")),
        upper  = list(continuous = wrap("cor", size = 3)),
        title  = "Scatterplot Matrix – Auto Dataset")

Findings: Strong negative correlations exist between mpg and displacement, horsepower, and weight (heavier, more powerful cars are less fuel-efficient). displacement, horsepower, and weight are highly positively correlated with each other. year shows a moderate positive correlation with mpg, suggesting newer cars became more fuel-efficient over time.

(f) Variables Useful for Predicting mpg

Auto %>%
  tidyr::pivot_longer(cols = c(displacement, horsepower, weight, acceleration, year, cylinders),
                      names_to = "predictor", values_to = "value") %>%
  ggplot(aes(x = value, y = mpg)) +
  geom_point(alpha = 0.3, size = 0.8, colour = "steelblue") +
  geom_smooth(method = "loess", se = TRUE, colour = "tomato") +
  facet_wrap(~predictor, scales = "free_x") +
  labs(title = "MPG vs. Each Quantitative Predictor", x = "Predictor Value", y = "MPG")

Conclusion: displacement, horsepower, and weight show strong non-linear (curvilinear) negative relationships with mpg and are the most useful predictors. year shows a positive trend — newer cars tend to have better fuel economy. cylinders also negatively correlates with mpg. acceleration has a weaker, noisier relationship.


Chapter 3 – Exercise 9

(a) Scatterplot Matrix

ggpairs(Auto %>% select(-name),
        lower  = list(continuous = wrap("points", alpha = 0.25, size = 0.6)),
        upper  = list(continuous = wrap("cor", size = 3)),
        title  = "Scatterplot Matrix – All Auto Variables")

(b) Correlation Matrix (excluding name)

cor(Auto %>% select(-name))
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
##              acceleration       year     origin
## mpg             0.4233285  0.5805410  0.5652088
## cylinders      -0.5046834 -0.3456474 -0.5689316
## displacement   -0.5438005 -0.3698552 -0.6145351
## horsepower     -0.6891955 -0.4163615 -0.4551715
## weight         -0.4168392 -0.3091199 -0.5850054
## acceleration    1.0000000  0.2903161  0.2127458
## year            0.2903161  1.0000000  0.1815277
## origin          0.2127458  0.1815277  1.0000000

(c) Multiple Linear Regression: mpg ~ All Predictors (except name)

lm_fit <- lm(mpg ~ . - name, data = Auto)
summary(lm_fit)
## 
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16

i. Relationship between predictors and response?
Yes. The overall F-statistic is highly significant (p < 2.2e-16), indicating that at least some predictors have a relationship with mpg.

ii. Which predictors are statistically significant?
displacement, weight, year, and origin have statistically significant coefficients (p < 0.05). cylinders, horsepower, and acceleration are not significant when controlling for the others.

iii. Coefficient for year:
The positive coefficient (~0.75) suggests that, holding other variables constant, cars made one year later have on average about 0.75 more miles per gallon. This reflects improving fuel-efficiency standards over time.

(d) Diagnostic Plots

par(mfrow = c(2, 2))
plot(lm_fit)

par(mfrow = c(1, 1))

Comments:
- The Residuals vs Fitted plot shows a slight U-shaped pattern, suggesting mild non-linearity not fully captured by the linear model.
- The Q-Q plot shows minor departures in the tails but is broadly acceptable.
- The Scale-Location plot indicates roughly constant variance, with some heteroscedasticity at higher fitted values.
- The Residuals vs Leverage plot identifies observation 14 as having unusually high leverage. A few points (e.g., 323, 327) are mild outliers but do not exert excessive influence.

(e) Interaction Effects

# Test selected interactions suggested by domain knowledge
lm_interact <- lm(mpg ~ . - name + displacement:weight + horsepower:weight + acceleration:horsepower,
                  data = Auto)
summary(lm_interact)
## 
## Call:
## lm(formula = mpg ~ . - name + displacement:weight + horsepower:weight + 
##     acceleration:horsepower, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8709 -1.6234 -0.0727  1.3837 11.8703 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -1.006e+01  5.623e+00  -1.789 0.074396 .  
## cylinders                2.760e-01  2.960e-01   0.933 0.351551    
## displacement            -3.355e-02  1.988e-02  -1.687 0.092360 .  
## horsepower              -6.366e-02  5.750e-02  -1.107 0.268942    
## weight                  -9.232e-03  9.096e-04 -10.149  < 2e-16 ***
## acceleration             4.527e-01  1.694e-01   2.672 0.007857 ** 
## year                     7.759e-01  4.445e-02  17.456  < 2e-16 ***
## origin                   5.935e-01  2.637e-01   2.251 0.024962 *  
## displacement:weight      8.120e-06  5.426e-06   1.497 0.135330    
## horsepower:weight        2.991e-05  1.304e-05   2.294 0.022341 *  
## horsepower:acceleration -6.311e-03  1.779e-03  -3.548 0.000437 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.886 on 381 degrees of freedom
## Multiple R-squared:  0.8668, Adjusted R-squared:  0.8633 
## F-statistic: 247.9 on 10 and 381 DF,  p-value: < 2.2e-16

Comment: The interaction displacement:weight and horsepower:weight are statistically significant, confirming that the joint effect of engine size and vehicle weight on fuel economy is not simply additive.

(f) Variable Transformations

# Log transformation
lm_log <- lm(mpg ~ log(displacement) + log(horsepower) + log(weight) + acceleration + year + origin,
             data = Auto)
summary(lm_log)
## 
## Call:
## lm(formula = mpg ~ log(displacement) + log(horsepower) + log(weight) + 
##     acceleration + year + origin, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5892 -1.7692 -0.0696  1.5646 12.8531 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       106.96196    9.88311  10.823  < 2e-16 ***
## log(displacement)   0.08762    1.05613   0.083 0.933923    
## log(horsepower)    -5.66760    1.56474  -3.622 0.000331 ***
## log(weight)       -13.99299    2.19174  -6.384 4.96e-10 ***
## acceleration       -0.19698    0.10271  -1.918 0.055867 .  
## year                0.72422    0.04697  15.419  < 2e-16 ***
## origin              0.91679    0.27198   3.371 0.000826 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.077 on 385 degrees of freedom
## Multiple R-squared:  0.8469, Adjusted R-squared:  0.8445 
## F-statistic:   355 on 6 and 385 DF,  p-value: < 2.2e-16
# Square root
lm_sqrt <- lm(mpg ~ sqrt(displacement) + sqrt(horsepower) + sqrt(weight) + acceleration + year + origin,
              data = Auto)
summary(lm_sqrt)
## 
## Call:
## lm(formula = mpg ~ sqrt(displacement) + sqrt(horsepower) + sqrt(weight) + 
##     acceleration + year + origin, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4363 -2.0056 -0.1541  1.7213 13.0062 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.89538    4.93212   0.993   0.3216    
## sqrt(displacement)  0.19846    0.15916   1.247   0.2132    
## sqrt(horsepower)   -0.65154    0.30287  -2.151   0.0321 *  
## sqrt(weight)       -0.64035    0.07755  -8.257 2.41e-15 ***
## acceleration       -0.04552    0.10235  -0.445   0.6567    
## year                0.73604    0.04920  14.960  < 2e-16 ***
## origin              1.15233    0.27541   4.184 3.55e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.206 on 385 degrees of freedom
## Multiple R-squared:  0.8339, Adjusted R-squared:  0.8313 
## F-statistic: 322.1 on 6 and 385 DF,  p-value: < 2.2e-16
# Polynomial (X^2) for horsepower
lm_poly <- lm(mpg ~ displacement + poly(horsepower, 2) + weight + acceleration + year + origin,
              data = Auto)
summary(lm_poly)
## 
## Call:
## lm(formula = mpg ~ displacement + poly(horsepower, 2) + weight + 
##     acceleration + year + origin, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7680 -1.6744 -0.1844  1.5704 12.0786 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.893e+01  3.755e+00  -5.041 7.16e-07 ***
## displacement         -1.670e-03  5.278e-03  -0.316  0.75193    
## poly(horsepower, 2)1 -5.120e+01  1.030e+01  -4.970 1.01e-06 ***
## poly(horsepower, 2)2  3.477e+01  3.649e+00   9.527  < 2e-16 ***
## weight               -3.303e-03  6.784e-04  -4.869 1.65e-06 ***
## acceleration         -3.210e-01  9.887e-02  -3.246  0.00127 ** 
## year                  7.351e-01  4.601e-02  15.977  < 2e-16 ***
## origin                1.056e+00  2.520e-01   4.190 3.46e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.002 on 384 degrees of freedom
## Multiple R-squared:  0.8547, Adjusted R-squared:  0.8521 
## F-statistic: 322.8 on 7 and 384 DF,  p-value: < 2.2e-16

Comment: Log-transforming displacement, horsepower, and weight increases the adjusted R² and reduces non-linearity in the residuals, suggesting these predictors have a multiplicative rather than additive relationship with mpg. The polynomial term for horsepower is also significant, confirming the curvilinear relationship observed in the scatterplots.


Chapter 3 – Exercise 15

Data Preparation

data(Boston)
str(Boston)
## 'data.frame':    506 obs. of  13 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

(a) Simple Linear Regression for Each Predictor

predictors <- setdiff(names(Boston), "crim")

slr_results <- lapply(predictors, function(var) {
  fit <- lm(as.formula(paste("crim ~", var)), data = Boston)
  s   <- summary(fit)
  data.frame(
    predictor = var,
    coef      = coef(fit)[2],
    p_value   = s$coefficients[2, 4],
    r_squared = s$r.squared
  )
})

slr_df <- do.call(rbind, slr_results) %>% arrange(p_value)
print(slr_df)
##         predictor        coef      p_value   r_squared
## rad           rad  0.61791093 2.693844e-56 0.391256687
## tax           tax  0.02974225 2.357127e-47 0.339614243
## lstat       lstat  0.54880478 2.654277e-27 0.207590933
## nox           nox 31.24853120 3.751739e-23 0.177217182
## indus       indus  0.50977633 1.450349e-21 0.165310070
## medv         medv -0.36315992 1.173987e-19 0.150780469
## dis           dis -1.55090168 8.519949e-19 0.144149375
## age           age  0.10778623 2.854869e-16 0.124421452
## ptratio   ptratio  1.15198279 2.942922e-11 0.084068439
## rm             rm -2.68405122 6.346703e-07 0.048069117
## zn             zn -0.07393498 5.506472e-06 0.040187908
## chas         chas -1.89277655 2.094345e-01 0.003123869
# Plot a few key relationships
Boston %>%
  tidyr::pivot_longer(cols = c(lstat, medv, dis, nox, rad),
                      names_to = "predictor", values_to = "value") %>%
  ggplot(aes(x = value, y = crim)) +
  geom_point(alpha = 0.3, size = 0.8, colour = "steelblue") +
  geom_smooth(method = "lm", colour = "tomato", se = TRUE) +
  facet_wrap(~predictor, scales = "free_x") +
  labs(title = "Per Capita Crime Rate vs. Selected Predictors")

Comment: Almost all predictors show a statistically significant association with crim in univariate regressions. rad (accessibility to radial highways), tax (property-tax rate), and lstat (lower-status population %) have the strongest positive associations. dis (distance to employment centers) and medv (median home value) have negative associations.

(b) Multiple Linear Regression

mlr_fit <- lm(crim ~ ., data = Boston)
summary(mlr_fit)
## 
## Call:
## lm(formula = crim ~ ., data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.534 -2.248 -0.348  1.087 73.923 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.7783938  7.0818258   1.946 0.052271 .  
## zn           0.0457100  0.0187903   2.433 0.015344 *  
## indus       -0.0583501  0.0836351  -0.698 0.485709    
## chas        -0.8253776  1.1833963  -0.697 0.485841    
## nox         -9.9575865  5.2898242  -1.882 0.060370 .  
## rm           0.6289107  0.6070924   1.036 0.300738    
## age         -0.0008483  0.0179482  -0.047 0.962323    
## dis         -1.0122467  0.2824676  -3.584 0.000373 ***
## rad          0.6124653  0.0875358   6.997 8.59e-12 ***
## tax         -0.0037756  0.0051723  -0.730 0.465757    
## ptratio     -0.3040728  0.1863598  -1.632 0.103393    
## lstat        0.1388006  0.0757213   1.833 0.067398 .  
## medv        -0.2200564  0.0598240  -3.678 0.000261 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared:  0.4493, Adjusted R-squared:  0.4359 
## F-statistic: 33.52 on 12 and 493 DF,  p-value: < 2.2e-16

Comment: In the multiple regression, only a handful of predictors remain significant after controlling for others: dis, rad, medv, and zn. Many predictors significant in univariate models (e.g., lstat, tax) lose significance due to multicollinearity.

(c) Univariate vs. Multiple Regression Coefficients

# Fully self-contained: rebuild everything needed here
data(Boston)

all_preds <- setdiff(names(Boston), "crim")

# Univariate coefficients
uni_coefs_c <- sapply(all_preds, function(var) {
  fit <- lm(Boston$crim ~ Boston[[var]])
  coef(fit)[2]
})

# Multiple regression coefficients
mlr_c    <- lm(crim ~ ., data = Boston)
multi_c  <- coef(mlr_c)[-1]  # drop intercept

# Build data frame — align by name
coef_df <- data.frame(
  predictor  = all_preds,
  univariate = as.numeric(uni_coefs_c),
  multiple   = as.numeric(multi_c[all_preds]),
  stringsAsFactors = FALSE
)

# Remove any rows with NA
coef_df <- coef_df[complete.cases(coef_df), ]

print(coef_df)  # sanity check — should show 12 rows with no NA
##    predictor  univariate      multiple
## 1         zn -0.07393498  0.0457100386
## 2      indus  0.50977633 -0.0583501107
## 3       chas -1.89277655 -0.8253775522
## 4        nox 31.24853120 -9.9575865471
## 5         rm -2.68405122  0.6289106622
## 6        age  0.10778623 -0.0008482791
## 7        dis -1.55090168 -1.0122467382
## 8        rad  0.61791093  0.6124653115
## 9        tax  0.02974225 -0.0037756465
## 10   ptratio  1.15198279 -0.3040727572
## 11     lstat  0.54880478  0.1388005968
## 12      medv -0.36315992 -0.2200563590
ggplot(coef_df, aes(x = univariate, y = multiple, label = predictor)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey60") +
  geom_point(colour = "steelblue", size = 3) +
  geom_text(vjust = -0.7, hjust = 0.5, size = 3, colour = "black") +
  scale_x_continuous(expand = expansion(mult = 0.15)) +
  scale_y_continuous(expand = expansion(mult = 0.15)) +
  labs(title = "Univariate vs. Multiple Regression Coefficients",
       subtitle = "Each point = one predictor of crim (Boston dataset)",
       x = "Simple (Univariate) Regression Coefficient",
       y = "Multiple Regression Coefficient") +
  theme_bw(base_size = 12)

Comment: The coefficients differ substantially between univariate and multiple regression, most notably for nox — its large positive univariate coefficient shrinks dramatically in multiple regression, indicating confounding. The dashed 45° line represents perfect agreement; most points fall below it, reflecting attenuation due to multicollinearity.

(d) Non-linear Associations (Cubic Polynomial)

poly_results <- lapply(predictors, function(var) {
  # skip binary variable
  if (length(unique(Boston[[var]])) < 4) return(NULL)
  fit <- lm(as.formula(paste("crim ~ poly(", var, ", 3)")), data = Boston)
  s   <- summary(fit)
  pvals <- s$coefficients[-1, 4]
  data.frame(
    predictor   = var,
    p_linear    = pvals[1],
    p_quadratic = pvals[2],
    p_cubic     = pvals[3]
  )
})

poly_df <- do.call(rbind, Filter(Negate(is.null), poly_results))
print(poly_df)
##                   predictor     p_linear  p_quadratic      p_cubic
## poly(zn, 3)1             zn 4.697806e-06 4.420507e-03 2.295386e-01
## poly(indus, 3)1       indus 8.854243e-24 1.086057e-03 1.196405e-12
## poly(nox, 3)1           nox 2.457491e-26 7.736755e-05 6.961110e-16
## poly(rm, 3)1             rm 5.128048e-07 1.508545e-03 5.085751e-01
## poly(age, 3)1           age 4.878803e-17 2.291156e-06 6.679915e-03
## poly(dis, 3)1           dis 1.253249e-21 7.869767e-14 1.088832e-08
## poly(rad, 3)1           rad 1.053211e-56 9.120558e-03 4.823138e-01
## poly(tax, 3)1           tax 6.976314e-49 3.665348e-06 2.438507e-01
## poly(ptratio, 3)1   ptratio 1.565484e-11 2.405468e-03 6.300514e-03
## poly(lstat, 3)1       lstat 1.678072e-27 3.780418e-02 1.298906e-01
## poly(medv, 3)1         medv 4.930818e-27 2.928577e-35 1.046510e-12

Comment: Several predictors show statistically significant quadratic and/or cubic terms — notably dis, lstat, nox, and medv — providing evidence of non-linear associations with crime rate. This suggests that purely linear models may be insufficient and that polynomial or other non-linear approaches could improve fit.


Chapter 4 – Exercise 13

Data Preparation

data(Weekly)
str(Weekly)
## 'data.frame':    1089 obs. of  9 variables:
##  $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
##  $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
##  $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
##  $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
##  $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
##  $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
##  $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 

(a) Numerical and Graphical Summaries

summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
ggpairs(Weekly %>% select(-Direction),
        lower  = list(continuous = wrap("points", alpha = 0.2, size = 0.5)),
        upper  = list(continuous = wrap("cor", size = 3)),
        title  = "Weekly Data – Scatterplot Matrix")

ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_line(colour = "steelblue") +
  labs(title = "Trading Volume Over Time", y = "Volume (billions of shares)")

Patterns: Trading volume has increased substantially over the 21-year period. The lag variables (Lag1–Lag5) show very low correlations with each other and with Today, consistent with the weak predictability of stock returns. Volume and Year are strongly correlated, reflecting the secular growth in market activity.

(b) Logistic Regression – Full Data

glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                data    = Weekly,
                family  = binomial)
summary(glm_full)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Comment: Only Lag2 has a statistically significant coefficient (p ≈ 0.03). The other lag variables and Volume are not significant predictors of market direction when all are included together.

(c) Confusion Matrix – Full Data

prob_full <- predict(glm_full, type = "response")
pred_full <- ifelse(prob_full > 0.5, "Up", "Down")
conf_full <- table(Predicted = pred_full, Actual = Weekly$Direction)
print(conf_full)
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
cat("Overall Correct:", mean(pred_full == Weekly$Direction), "\n")
## Overall Correct: 0.5610652

Interpretation: The model predominantly predicts “Up”, achieving a high accuracy for Up weeks but almost completely missing Down weeks. The overall fraction correct (~56%) is only marginally better than random guessing, driven largely by the market’s upward bias during this period. The high false-positive rate (Down predicted as Up) indicates poor discrimination.

(d) Training/Test Split – Logistic Regression with Lag2 Only

train <- Weekly %>% filter(Year <= 2008)
test  <- Weekly %>% filter(Year >= 2009)

glm_lag2 <- glm(Direction ~ Lag2,
                data   = train,
                family = binomial)

prob_test <- predict(glm_lag2, newdata = test, type = "response")
pred_test <- ifelse(prob_test > 0.5, "Up", "Down")

conf_test <- table(Predicted = pred_test, Actual = test$Direction)
print(conf_test)
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
cat("Overall Correct (test):", mean(pred_test == test$Direction), "\n")
## Overall Correct (test): 0.625

Comment: Using only Lag2 as a predictor on the held-out 2009–2010 data, the model achieves approximately 62.5% correct predictions. The confusion matrix shows that the model still favours predicting “Up”, but captures a reasonable fraction of true Up weeks. This is a modest improvement over the full-model baseline, consistent with Lag2 being the only significant predictor in the full model. ```