library(ISLR2)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(GGally)    # ggpairs scatterplot matrix
library(corrplot)  # correlation heatmap

## corrplot 0.95 loaded

library(pROC)      # ROC curves

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Chapter 2 – Exercise 9 (`Auto` Dataset)

Make sure that missing values have been removed from the data.

data(Auto)
Auto <- na.omit(Auto)
dim(Auto)

## [1] 392   9

head(Auto)

##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

(a) Quantitative vs. Qualitative Predictors

str(Auto)

## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...

Quantitative predictors: mpg, cylinders, displacement, horsepower, weight, acceleration, year

Qualitative predictor: name (and origin is technically categorical — country code 1/2/3 — but stored as integer)

(b) Range of Each Quantitative Predictor

quant_vars <- Auto %>% select(mpg, cylinders, displacement, horsepower, weight, acceleration, year)

range_df <- quant_vars %>%
  summarise(across(everything(), list(Min = min, Max = max))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_(?=[^_]+$)")

knitr::kable(range_df, caption = "Range of Quantitative Predictors")

Range of Quantitative Predictors
Variable	Min	Max
mpg	9	46.6
cylinders	3	8.0
displacement	68	455.0
horsepower	46	230.0
weight	1613	5140.0
acceleration	8	24.8
year	70	82.0

(c) Mean and Standard Deviation of Each Quantitative Predictor

stats_df <- quant_vars %>%
  summarise(across(everything(), list(Mean = mean, SD = sd))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_(?=[^_]+$)") %>%
  mutate(across(c(Mean, SD), ~ round(.x, 2)))

knitr::kable(stats_df, caption = "Mean and Standard Deviation of Quantitative Predictors")

Mean and Standard Deviation of Quantitative Predictors
Variable	Mean	SD
mpg	23.45	7.81
cylinders	5.47	1.71
displacement	194.41	104.64
horsepower	104.47	38.49
weight	2977.58	849.40
acceleration	15.54	2.76
year	75.98	3.68

(d) Remove Observations 10–85; Recompute Statistics

Auto_sub <- Auto[-(10:85), ]
dim(Auto_sub)

## [1] 316   9

quant_sub <- Auto_sub %>% select(mpg, cylinders, displacement, horsepower, weight, acceleration, year)

sub_stats <- quant_sub %>%
  summarise(across(everything(),
    list(Min = min, Max = max, Mean = mean, SD = sd))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_(?=[^_]+$)") %>%
  mutate(across(c(Mean, SD), ~ round(.x, 2)))

knitr::kable(sub_stats, caption = "Statistics for Subset (rows 10–85 removed)")

Statistics for Subset (rows 10–85 removed)
Variable	Min	Max	Mean	SD
mpg	11.0	46.6	24.40	7.87
cylinders	3.0	8.0	5.37	1.65
displacement	68.0	455.0	187.24	99.68
horsepower	46.0	230.0	100.72	35.71
weight	1649.0	4997.0	2935.97	811.30
acceleration	8.5	24.8	15.73	2.69
year	70.0	82.0	77.15	3.11

(e) Graphical Investigation of Predictors

# Scatterplot matrix for quantitative variables
ggpairs(quant_vars,
        upper = list(continuous = wrap("cor", size = 3)),
        lower = list(continuous = wrap("points", alpha = 0.3, size = 0.6)),
        diag  = list(continuous = wrap("densityDiag")),
        title = "Scatterplot Matrix – Auto Dataset") +
  theme_bw(base_size = 9)

Findings: Strong negative correlations exist between mpg and displacement, horsepower, and weight. displacement, horsepower, and weight are heavily correlated with each other (multicollinearity). cylinders clusters around 4, 6, and 8 — behaving more like a categorical variable.

(f) Variables Useful for Predicting `mpg`

Auto %>%
  select(mpg, displacement, horsepower, weight, acceleration, year) %>%
  pivot_longer(-mpg, names_to = "Predictor", values_to = "Value") %>%
  ggplot(aes(x = Value, y = mpg)) +
  geom_point(alpha = 0.3, size = 0.8, color = "#2980B9") +
  geom_smooth(method = "loess", color = "#E74C3C", se = TRUE) +
  facet_wrap(~ Predictor, scales = "free_x", ncol = 3) +
  labs(title = "Predictors vs. mpg", x = "Predictor Value", y = "mpg") +
  theme_bw()

## `geom_smooth()` using formula = 'y ~ x'

Conclusion: displacement, horsepower, and weight show strong negative nonlinear relationships with mpg — all three are useful predictors. year shows a positive trend (newer cars are more fuel-efficient). acceleration has a weaker, noisier relationship.

Chapter 3 – Exercise 9 (`Auto` Dataset — Multiple Linear Regression)

(a) Scatterplot Matrix

Auto_num <- Auto %>% select(-name)

ggpairs(Auto_num,
        upper = list(continuous = wrap("cor", size = 2.8)),
        lower = list(continuous = wrap("points", alpha = 0.25, size = 0.5)),
        title = "Scatterplot Matrix (All Variables, excluding name)") +
  theme_bw(base_size = 8)

(b) Correlation Matrix

cor_matrix <- cor(Auto_num)
knitr::kable(round(cor_matrix, 3), caption = "Correlation Matrix (excluding name)")

Correlation Matrix (excluding name)
	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin
mpg	1.000	-0.778	-0.805	-0.778	-0.832	0.423	0.581	0.565
cylinders	-0.778	1.000	0.951	0.843	0.898	-0.505	-0.346	-0.569
displacement	-0.805	0.951	1.000	0.897	0.933	-0.544	-0.370	-0.615
horsepower	-0.778	0.843	0.897	1.000	0.865	-0.689	-0.416	-0.455
weight	-0.832	0.898	0.933	0.865	1.000	-0.417	-0.309	-0.585
acceleration	0.423	-0.505	-0.544	-0.689	-0.417	1.000	0.290	0.213
year	0.581	-0.346	-0.370	-0.416	-0.309	0.290	1.000	0.182
origin	0.565	-0.569	-0.615	-0.455	-0.585	0.213	0.182	1.000

corrplot(cor_matrix, method = "color", type = "upper",
         addCoef.col = "black", number.cex = 0.65,
         tl.cex = 0.8, title = "Correlation Heatmap", mar = c(0,0,1,0))

(c) Multiple Linear Regression: `mpg` ~ All Predictors (excluding `name`)

lm_full <- lm(mpg ~ . - name, data = Auto)
summary(lm_full)

## 
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16

i. Is there a relationship between the predictors and the response?

Yes — the overall F-statistic is highly significant (p < 2.2e-16), confirming a relationship between the predictors and mpg.

ii. Which predictors are statistically significant?

displacement, weight, year, and origin all show p-values < 0.05 and are statistically significant predictors of mpg. horsepower and acceleration are not significant at the 5% level.

iii. What does the coefficient for year suggest?

The positive coefficient for year (~0.75) suggests that, holding other variables constant, each additional model year is associated with an increase of about 0.75 mpg — reflecting the trend toward more fuel-efficient vehicles over time.

(d) Diagnostic Plots

par(mfrow = c(2, 2))
plot(lm_full)

par(mfrow = c(1, 1))

Comments:

Residuals vs Fitted: There is a slight nonlinear (U-shaped) pattern, suggesting the linear model may not fully capture the relationship — a transformation might help.
Q-Q Plot: Residuals follow the normal line fairly well, with mild deviation in the tails.
Scale-Location: Some heteroscedasticity is present (variance increases with fitted values).
Residuals vs Leverage: Observation 14 has notably high leverage. A few points (e.g., 323, 327) appear as mild outliers but no extreme Cook’s distance values.

(e) Interaction Effects

# Key interactions suggested by high correlations
lm_interact <- lm(mpg ~ displacement + horsepower + weight + year + origin +
                  cylinders + acceleration +
                  displacement:weight +
                  horsepower:year +
                  weight:year,
                  data = Auto)
summary(lm_interact)

## 
## Call:
## lm(formula = mpg ~ displacement + horsepower + weight + year + 
##     origin + cylinders + acceleration + displacement:weight + 
##     horsepower:year + weight:year, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4158 -1.4565 -0.0318  1.3979 11.2571 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -5.468e+01  1.385e+01  -3.947 9.43e-05 ***
## displacement        -7.006e-02  1.106e-02  -6.337 6.61e-10 ***
## horsepower           8.122e-01  1.696e-01   4.790 2.39e-06 ***
## weight              -2.118e-02  8.390e-03  -2.524   0.0120 *  
## year                 1.437e+00  1.708e-01   8.411 8.28e-16 ***
## origin               4.994e-01  2.468e-01   2.023   0.0438 *  
## cylinders            5.035e-01  2.825e-01   1.782   0.0755 .  
## acceleration        -6.393e-02  9.014e-02  -0.709   0.4786    
## displacement:weight  1.987e-05  2.310e-06   8.604  < 2e-16 ***
## horsepower:year     -1.160e-02  2.311e-03  -5.021 7.90e-07 ***
## weight:year          1.620e-04  1.114e-04   1.454   0.1467    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.785 on 381 degrees of freedom
## Multiple R-squared:  0.8759, Adjusted R-squared:  0.8727 
## F-statistic:   269 on 10 and 381 DF,  p-value: < 2.2e-16

Finding: The interaction horsepower:year and weight:year are statistically significant (p < 0.05), suggesting that the effect of horsepower and weight on mpg has changed over time — heavier, high-horsepower cars improved more in fuel efficiency in later years.

(f) Variable Transformations

lm_log    <- lm(mpg ~ log(displacement) + log(horsepower) + log(weight) + year + origin, data = Auto)
lm_sqrt   <- lm(mpg ~ sqrt(displacement) + sqrt(horsepower) + sqrt(weight) + year + origin, data = Auto)
lm_sq     <- lm(mpg ~ displacement + I(displacement^2) + weight + year + origin, data = Auto)

cat("R² – log transform:    ", summary(lm_log)$r.squared,  "\n")

## R² – log transform:     0.8454534

cat("R² – sqrt transform:   ", summary(lm_sqrt)$r.squared, "\n")

## R² – sqrt transform:    0.8337855

cat("R² – quadratic:        ", summary(lm_sq)$r.squared,   "\n")

## R² – quadratic:         0.8418804

cat("R² – original (full):  ", summary(lm_full)$r.squared, "\n")

## R² – original (full):   0.8214781

par(mfrow = c(2, 2))
plot(lm_log)

par(mfrow = c(1, 1))

Conclusion: The log-transformed model achieves the highest R² and the residual plots are more homoscedastic — log transformations of displacement, horsepower, and weight improve model fit compared to using the raw values.

Chapter 3 – Exercise 15 (`Boston` Dataset — Simple & Multiple Regression)

data(Boston)
glimpse(Boston)

## Rows: 506
## Columns: 13
## $ crim    <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ zn      <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ indus   <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ chas    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nox     <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ rm      <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ age     <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ dis     <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ rad     <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ tax     <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ lstat   <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ medv    <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…

(a) Simple Linear Regression for Each Predictor → `crim`

predictors <- setdiff(names(Boston), "crim")

slr_results <- map_dfr(predictors, function(pred) {
  fit <- lm(as.formula(paste("crim ~", pred)), data = Boston)
  s   <- summary(fit)
  tibble(
    Predictor   = pred,
    Coefficient = round(coef(fit)[2], 4),
    `p-value`   = round(coef(s)[2, 4], 4),
    R2          = round(s$r.squared, 4),
    Significant = ifelse(coef(s)[2, 4] < 0.05, "Yes ✓", "No")
  )
})

knitr::kable(slr_results, caption = "Simple Linear Regression: crim ~ each predictor")

Simple Linear Regression: crim ~ each predictor
Predictor	Coefficient	p-value	R2	Significant
zn	-0.0739	0.0000	0.0402	Yes ✓
indus	0.5098	0.0000	0.1653	Yes ✓
chas	-1.8928	0.2094	0.0031	No
nox	31.2485	0.0000	0.1772	Yes ✓
rm	-2.6841	0.0000	0.0481	Yes ✓
age	0.1078	0.0000	0.1244	Yes ✓
dis	-1.5509	0.0000	0.1441	Yes ✓
rad	0.6179	0.0000	0.3913	Yes ✓
tax	0.0297	0.0000	0.3396	Yes ✓
ptratio	1.1520	0.0000	0.0841	Yes ✓
lstat	0.5488	0.0000	0.2076	Yes ✓
medv	-0.3632	0.0000	0.1508	Yes ✓

# Plot crim vs. most significant predictors
sig_preds <- slr_results %>% filter(Significant == "Yes ✓") %>% pull(Predictor)

Boston %>%
  select(crim, all_of(sig_preds)) %>%
  pivot_longer(-crim, names_to = "Predictor", values_to = "Value") %>%
  ggplot(aes(x = Value, y = crim)) +
  geom_point(alpha = 0.3, size = 0.7, color = "#8E44AD") +
  geom_smooth(method = "lm", color = "#E74C3C", se = TRUE) +
  facet_wrap(~ Predictor, scales = "free_x", ncol = 4) +
  labs(title = "Significant Predictors vs. Crime Rate (crim)",
       x = "Predictor Value", y = "Per Capita Crime Rate") +
  theme_bw(base_size = 9)

## `geom_smooth()` using formula = 'y ~ x'

Finding: Almost all predictors are statistically significant in simple linear regression. rad (accessibility to radial highways), tax (tax rate), and lstat (% lower status) show the strongest associations with crime rate.

(b) Multiple Linear Regression: `crim` ~ All Predictors

mlr_boston <- lm(crim ~ ., data = Boston)
summary(mlr_boston)

## 
## Call:
## lm(formula = crim ~ ., data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.534 -2.248 -0.348  1.087 73.923 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.7783938  7.0818258   1.946 0.052271 .  
## zn           0.0457100  0.0187903   2.433 0.015344 *  
## indus       -0.0583501  0.0836351  -0.698 0.485709    
## chas        -0.8253776  1.1833963  -0.697 0.485841    
## nox         -9.9575865  5.2898242  -1.882 0.060370 .  
## rm           0.6289107  0.6070924   1.036 0.300738    
## age         -0.0008483  0.0179482  -0.047 0.962323    
## dis         -1.0122467  0.2824676  -3.584 0.000373 ***
## rad          0.6124653  0.0875358   6.997 8.59e-12 ***
## tax         -0.0037756  0.0051723  -0.730 0.465757    
## ptratio     -0.3040728  0.1863598  -1.632 0.103393    
## lstat        0.1388006  0.0757213   1.833 0.067398 .  
## medv        -0.2200564  0.0598240  -3.678 0.000261 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared:  0.4493, Adjusted R-squared:  0.4359 
## F-statistic: 33.52 on 12 and 493 DF,  p-value: < 2.2e-16

Finding: In the multiple regression model, only zn, dis, rad, black, and medv are statistically significant (p < 0.05). Many predictors that were significant in simple regression lose significance — indicating confounding and multicollinearity among predictors.

(c) Univariate vs. Multiple Regression Coefficients

uni_coefs   <- slr_results$Coefficient
multi_coefs <- coef(mlr_boston)[-1]  # exclude intercept

coef_compare <- tibble(
  Predictor  = predictors,
  Univariate = uni_coefs,
  Multiple   = as.numeric(multi_coefs)
)

ggplot(coef_compare, aes(x = Univariate, y = Multiple, label = Predictor)) +
  geom_point(color = "#E67E22", size = 3) +
  geom_text(vjust = -0.7, size = 3.2) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  labs(title = "Univariate vs. Multiple Regression Coefficients",
       subtitle = "Each point = one predictor",
       x = "Simple Regression Coefficient",
       y = "Multiple Regression Coefficient") +
  theme_bw()

Comment: The nox variable shows the largest discrepancy — a large positive coefficient in simple regression shrinks dramatically in the multiple model, reflecting collinearity with other predictors.

(d) Non-linear Associations (Cubic Polynomial)

poly_results <- map_dfr(predictors, function(pred) {
  n_unique <- length(unique(Boston[[pred]]))
  degree   <- min(3, n_unique - 1)   # cap degree to avoid poly() error

  fit_poly <- lm(as.formula(paste("crim ~ poly(", pred, ",", degree, ")")), data = Boston)
  s        <- summary(fit_poly)
  coefs    <- coef(s)

  tibble(
    Predictor   = pred,
    Max_Degree  = degree,
    p_linear    = round(coefs[2, 4], 4),
    p_quadratic = ifelse(nrow(coefs) >= 3, round(coefs[3, 4], 4), NA_real_),
    p_cubic     = ifelse(nrow(coefs) >= 4, round(coefs[4, 4], 4), NA_real_),
    R2          = round(s$r.squared, 4)
  )
})

knitr::kable(poly_results, caption = "Cubic Polynomial Fits: crim ~ poly(X, 3)")

Cubic Polynomial Fits: crim ~ poly(X, 3)
Predictor	Max_Degree	p_linear	p_quadratic	p_cubic	R2
zn	3	0.0000	0.0044	0.2295	0.0582
indus	3	0.0000	0.0011	0.0000	0.2597
chas	1	0.2094	NA	NA	0.0031
nox	3	0.0000	0.0001	0.0000	0.2970
rm	3	0.0000	0.0015	0.5086	0.0678
age	3	0.0000	0.0000	0.0067	0.1742
dis	3	0.0000	0.0000	0.0000	0.2778
rad	3	0.0000	0.0091	0.4823	0.4000
tax	3	0.0000	0.0000	0.2439	0.3689
ptratio	3	0.0000	0.0024	0.0063	0.1138
lstat	3	0.0000	0.0378	0.1299	0.2179
medv	3	0.0000	0.0000	0.0000	0.4202

Finding: Several predictors — including indus, nox, age, dis, ptratio, and medv — show significant quadratic or cubic terms, providing evidence of non-linear associations with crime rate.

Chapter 4 – Exercise 13 (`Weekly` Dataset — Logistic Regression)

data(Weekly)
glimpse(Weekly)

## Rows: 1,089
## Columns: 9
## $ Year      <dbl> 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, …
## $ Lag1      <dbl> 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0…
## $ Lag2      <dbl> 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0…
## $ Lag3      <dbl> -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, 1.178, -…
## $ Lag4      <dbl> -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514, 0.712, …
## $ Lag5      <dbl> -3.484, -0.229, -3.936, 1.572, 0.816, -0.270, -2.576, 3.514,…
## $ Volume    <dbl> 0.1549760, 0.1485740, 0.1598375, 0.1616300, 0.1537280, 0.154…
## $ Today     <dbl> -0.270, -2.576, 3.514, 0.712, 1.178, -1.372, 0.807, 0.041, 1…
## $ Direction <fct> Down, Down, Up, Up, Up, Down, Up, Up, Up, Down, Down, Up, Up…

(a) Numerical and Graphical Summaries

summary(Weekly)

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

# Volume over time
ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_line(color = "#2980B9", alpha = 0.6) +
  geom_smooth(color = "#E74C3C", se = FALSE) +
  labs(title = "Trading Volume Over Time (Weekly)", y = "Volume (billions of shares)") +
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Lag correlations with Today
Weekly %>%
  select(Lag1:Lag5, Volume, Today) %>%
  cor() %>%
  corrplot(method = "color", addCoef.col = "black", number.cex = 0.7,
           tl.cex = 0.8, title = "Correlations – Weekly Variables", mar = c(0,0,1,0))

Patterns: Trading volume has increased substantially over the 21-year period. The lag variables show very weak correlations with Today’s return — consistent with market efficiency. Volume and Year are strongly correlated.

(b) Logistic Regression: `Direction` ~ Lag1–Lag5 + Volume

logit_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                  data = Weekly, family = binomial)
summary(logit_full)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Finding: Only Lag2 is statistically significant (p ≈ 0.03). The other lag variables and Volume do not appear significant.

(c) Confusion Matrix — Full Training Set

pred_prob_full <- predict(logit_full, type = "response")
pred_dir_full  <- ifelse(pred_prob_full > 0.5, "Up", "Down")

conf_mat <- table(Predicted = pred_dir_full, Actual = Weekly$Direction)
conf_mat

##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557

accuracy <- mean(pred_dir_full == Weekly$Direction)
cat("Overall Fraction of Correct Predictions:", round(accuracy, 4), "\n")

## Overall Fraction of Correct Predictions: 0.5611

Interpretation: The model has an overall accuracy of ~56%. Looking at the confusion matrix: it tends to predict “Up” very often — it captures most actual “Up” weeks correctly but misclassifies many “Down” weeks as “Up”. This reflects the market’s upward bias and the model’s tendency to predict the majority class.

(d) Train on 1990–2008, Test on 2009–2010 (Lag2 only)

train <- Weekly %>% filter(Year <= 2008)
test  <- Weekly %>% filter(Year  > 2008)

logit_lag2 <- glm(Direction ~ Lag2, data = train, family = binomial)
summary(logit_lag2)

## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4

pred_prob_test <- predict(logit_lag2, newdata = test, type = "response")
pred_dir_test  <- ifelse(pred_prob_test > 0.5, "Up", "Down")

conf_test <- table(Predicted = pred_dir_test, Actual = test$Direction)
conf_test

##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56

acc_test <- mean(pred_dir_test == test$Direction)
cat("Test Accuracy (2009–2010):", round(acc_test, 4), "\n")

## Test Accuracy (2009–2010): 0.625

Interpretation: The Lag2-only model achieves approximately 62.5% accuracy on the held-out 2009–2010 data — better than the full model on training data, suggesting Lag2 is the most useful predictor and avoiding overfitting by dropping insignificant predictors helps generalization.

Session Info

sessionInfo()

## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] pROC_1.19.0.1   corrplot_0.95   GGally_2.4.0    lubridate_1.9.5
##  [5] forcats_1.0.1   stringr_1.6.0   dplyr_1.2.0     purrr_1.2.1    
##  [9] readr_2.2.0     tidyr_1.3.2     tibble_3.3.1    ggplot2_4.0.2  
## [13] tidyverse_2.0.0 ISLR2_1.3-2    
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     stringi_1.8.7      lattice_0.22-9    
##  [5] hms_1.1.4          digest_0.6.39      magrittr_2.0.4     evaluate_1.0.5    
##  [9] grid_4.5.3         timechange_0.4.0   RColorBrewer_1.1-3 fastmap_1.2.0     
## [13] Matrix_1.7-4       jsonlite_2.0.0     mgcv_1.9-4         scales_1.4.0      
## [17] jquerylib_0.1.4    cli_3.6.5          rlang_1.1.7        splines_4.5.3     
## [21] withr_3.0.2        cachem_1.1.0       yaml_2.3.12        tools_4.5.3       
## [25] tzdb_0.5.0         ggstats_0.13.0     vctrs_0.7.1        R6_2.6.1          
## [29] lifecycle_1.0.5    pkgconfig_2.0.3    pillar_1.11.1      bslib_0.10.0      
## [33] gtable_0.3.6       glue_1.8.0         Rcpp_1.1.1         xfun_0.56         
## [37] tidyselect_1.2.1   rstudioapi_0.18.0  knitr_1.51         farver_2.1.2      
## [41] htmltools_0.5.9    nlme_3.1-168       rmarkdown_2.30     labeling_0.4.3    
## [45] compiler_4.5.3     S7_0.2.1

finance midterm

Alex 123

2026-04-14

Chapter 2 – Exercise 9 (`Auto` Dataset)

(a) Quantitative vs. Qualitative Predictors

(b) Range of Each Quantitative Predictor

(c) Mean and Standard Deviation of Each Quantitative Predictor

(d) Remove Observations 10–85; Recompute Statistics

(e) Graphical Investigation of Predictors

(f) Variables Useful for Predicting `mpg`

Chapter 3 – Exercise 9 (`Auto` Dataset — Multiple Linear Regression)

(a) Scatterplot Matrix

(b) Correlation Matrix

(c) Multiple Linear Regression: `mpg` ~ All Predictors (excluding `name`)

(d) Diagnostic Plots

(e) Interaction Effects

(f) Variable Transformations

Chapter 3 – Exercise 15 (`Boston` Dataset — Simple & Multiple Regression)

(a) Simple Linear Regression for Each Predictor → `crim`

(b) Multiple Linear Regression: `crim` ~ All Predictors

(c) Univariate vs. Multiple Regression Coefficients

(d) Non-linear Associations (Cubic Polynomial)

Chapter 4 – Exercise 13 (`Weekly` Dataset — Logistic Regression)

(a) Numerical and Graphical Summaries

(b) Logistic Regression: `Direction` ~ Lag1–Lag5 + Volume

(c) Confusion Matrix — Full Training Set

(d) Train on 1990–2008, Test on 2009–2010 (Lag2 only)

Session Info

finance midterm

Alex 123

2026-04-14

Chapter 2 – Exercise 9 (Auto Dataset)

(a) Quantitative vs. Qualitative Predictors

(b) Range of Each Quantitative Predictor

(c) Mean and Standard Deviation of Each Quantitative Predictor

(d) Remove Observations 10–85; Recompute Statistics

(e) Graphical Investigation of Predictors

(f) Variables Useful for Predicting mpg

Chapter 3 – Exercise 9 (Auto Dataset — Multiple Linear Regression)

(a) Scatterplot Matrix

(b) Correlation Matrix

(c) Multiple Linear Regression: mpg ~ All Predictors (excluding name)

(d) Diagnostic Plots

(e) Interaction Effects

(f) Variable Transformations

Chapter 3 – Exercise 15 (Boston Dataset — Simple & Multiple Regression)

(a) Simple Linear Regression for Each Predictor → crim

(b) Multiple Linear Regression: crim ~ All Predictors

(c) Univariate vs. Multiple Regression Coefficients

(d) Non-linear Associations (Cubic Polynomial)

Chapter 4 – Exercise 13 (Weekly Dataset — Logistic Regression)

(a) Numerical and Graphical Summaries

(b) Logistic Regression: Direction ~ Lag1–Lag5 + Volume

(c) Confusion Matrix — Full Training Set

(d) Train on 1990–2008, Test on 2009–2010 (Lag2 only)

Session Info

Chapter 2 – Exercise 9 (`Auto` Dataset)

(f) Variables Useful for Predicting `mpg`

Chapter 3 – Exercise 9 (`Auto` Dataset — Multiple Linear Regression)

(c) Multiple Linear Regression: `mpg` ~ All Predictors (excluding `name`)

Chapter 3 – Exercise 15 (`Boston` Dataset — Simple & Multiple Regression)

(a) Simple Linear Regression for Each Predictor → `crim`

(b) Multiple Linear Regression: `crim` ~ All Predictors

Chapter 4 – Exercise 13 (`Weekly` Dataset — Logistic Regression)

(b) Logistic Regression: `Direction` ~ Lag1–Lag5 + Volume