data(Auto)
Auto <- na.omit(Auto)
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
The quantitative predictors are: mpg,
cylinders, displacement,
horsepower, weight, acceleration,
year.
The qualitative predictor is: name (and
origin can be treated as qualitative/categorical since it
codes 1 = American, 2 = European, 3 = Japanese).
# Confirm variable types
sapply(Auto, class)
## mpg cylinders displacement horsepower weight acceleration
## "numeric" "integer" "numeric" "integer" "integer" "numeric"
## year origin name
## "integer" "integer" "factor"
quant_vars <- Auto %>% select(mpg, cylinders, displacement, horsepower, weight, acceleration, year)
sapply(quant_vars, range)
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 9.0 3 68 46 1613 8.0 70
## [2,] 46.6 8 455 230 5140 24.8 82
sapply(quant_vars, mean)
## mpg cylinders displacement horsepower weight acceleration
## 23.445918 5.471939 194.411990 104.469388 2977.584184 15.541327
## year
## 75.979592
sapply(quant_vars, sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.805007 1.705783 104.644004 38.491160 849.402560 2.758864
## year
## 3.683737
Auto_sub <- Auto[-(10:85), ]
quant_sub <- Auto_sub %>% select(mpg, cylinders, displacement, horsepower, weight, acceleration, year)
cat("Range:\n"); print(sapply(quant_sub, range))
## Range:
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0 3 68 46 1649 8.5 70
## [2,] 46.6 8 455 230 4997 24.8 82
cat("\nMean:\n"); print(sapply(quant_sub, mean))
##
## Mean:
## mpg cylinders displacement horsepower weight acceleration
## 24.404430 5.373418 187.240506 100.721519 2935.971519 15.726899
## year
## 77.145570
cat("\nSD:\n"); print(sapply(quant_sub, sd))
##
## SD:
## mpg cylinders displacement horsepower weight acceleration
## 7.867283 1.654179 99.678367 35.708853 811.300208 2.693721
## year
## 3.106217
ggpairs(quant_vars,
lower = list(continuous = wrap("points", alpha = 0.3, size = 0.8)),
diag = list(continuous = wrap("densityDiag")),
upper = list(continuous = wrap("cor", size = 3)),
title = "Scatterplot Matrix – Auto Dataset")
Findings: Strong negative correlations exist between
mpg and displacement, horsepower,
and weight (heavier, more powerful cars are less
fuel-efficient). displacement, horsepower, and
weight are highly positively correlated with each other.
year shows a moderate positive correlation with
mpg, suggesting newer cars became more fuel-efficient over
time.
mpgAuto %>%
tidyr::pivot_longer(cols = c(displacement, horsepower, weight, acceleration, year, cylinders),
names_to = "predictor", values_to = "value") %>%
ggplot(aes(x = value, y = mpg)) +
geom_point(alpha = 0.3, size = 0.8, colour = "steelblue") +
geom_smooth(method = "loess", se = TRUE, colour = "tomato") +
facet_wrap(~predictor, scales = "free_x") +
labs(title = "MPG vs. Each Quantitative Predictor", x = "Predictor Value", y = "MPG")
Conclusion: displacement,
horsepower, and weight show strong non-linear
(curvilinear) negative relationships with mpg and are the
most useful predictors. year shows a positive trend — newer
cars tend to have better fuel economy. cylinders also
negatively correlates with mpg. acceleration
has a weaker, noisier relationship.
ggpairs(Auto %>% select(-name),
lower = list(continuous = wrap("points", alpha = 0.25, size = 0.6)),
upper = list(continuous = wrap("cor", size = 3)),
title = "Scatterplot Matrix – All Auto Variables")
name)cor(Auto %>% select(-name))
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
mpg ~ All Predictors
(except name)lm_fit <- lm(mpg ~ . - name, data = Auto)
summary(lm_fit)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
i. Relationship between predictors and
response?
Yes. The overall F-statistic is highly significant (p < 2.2e-16),
indicating that at least some predictors have a relationship with
mpg.
ii. Which predictors are statistically
significant?
displacement, weight, year, and
origin have statistically significant coefficients (p <
0.05). cylinders, horsepower, and
acceleration are not significant when controlling for the
others.
iii. Coefficient for year:
The positive coefficient (~0.75) suggests that, holding other variables
constant, cars made one year later have on average about 0.75 more miles
per gallon. This reflects improving fuel-efficiency standards over
time.
par(mfrow = c(2, 2))
plot(lm_fit)
par(mfrow = c(1, 1))
Comments:
- The Residuals vs Fitted plot shows a slight U-shaped pattern,
suggesting mild non-linearity not fully captured by the linear
model.
- The Q-Q plot shows minor departures in the tails but is
broadly acceptable.
- The Scale-Location plot indicates roughly constant variance,
with some heteroscedasticity at higher fitted values.
- The Residuals vs Leverage plot identifies observation
14 as having unusually high leverage. A few points
(e.g., 323, 327) are mild outliers but do not exert excessive
influence.
# Test selected interactions suggested by domain knowledge
lm_interact <- lm(mpg ~ . - name + displacement:weight + horsepower:weight + acceleration:horsepower,
data = Auto)
summary(lm_interact)
##
## Call:
## lm(formula = mpg ~ . - name + displacement:weight + horsepower:weight +
## acceleration:horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8709 -1.6234 -0.0727 1.3837 11.8703
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.006e+01 5.623e+00 -1.789 0.074396 .
## cylinders 2.760e-01 2.960e-01 0.933 0.351551
## displacement -3.355e-02 1.988e-02 -1.687 0.092360 .
## horsepower -6.366e-02 5.750e-02 -1.107 0.268942
## weight -9.232e-03 9.096e-04 -10.149 < 2e-16 ***
## acceleration 4.527e-01 1.694e-01 2.672 0.007857 **
## year 7.759e-01 4.445e-02 17.456 < 2e-16 ***
## origin 5.935e-01 2.637e-01 2.251 0.024962 *
## displacement:weight 8.120e-06 5.426e-06 1.497 0.135330
## horsepower:weight 2.991e-05 1.304e-05 2.294 0.022341 *
## horsepower:acceleration -6.311e-03 1.779e-03 -3.548 0.000437 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.886 on 381 degrees of freedom
## Multiple R-squared: 0.8668, Adjusted R-squared: 0.8633
## F-statistic: 247.9 on 10 and 381 DF, p-value: < 2.2e-16
Comment: The interaction
displacement:weight and horsepower:weight are
statistically significant, confirming that the joint effect of engine
size and vehicle weight on fuel economy is not simply additive.
# Log transformation
lm_log <- lm(mpg ~ log(displacement) + log(horsepower) + log(weight) + acceleration + year + origin,
data = Auto)
summary(lm_log)
##
## Call:
## lm(formula = mpg ~ log(displacement) + log(horsepower) + log(weight) +
## acceleration + year + origin, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5892 -1.7692 -0.0696 1.5646 12.8531
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106.96196 9.88311 10.823 < 2e-16 ***
## log(displacement) 0.08762 1.05613 0.083 0.933923
## log(horsepower) -5.66760 1.56474 -3.622 0.000331 ***
## log(weight) -13.99299 2.19174 -6.384 4.96e-10 ***
## acceleration -0.19698 0.10271 -1.918 0.055867 .
## year 0.72422 0.04697 15.419 < 2e-16 ***
## origin 0.91679 0.27198 3.371 0.000826 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.077 on 385 degrees of freedom
## Multiple R-squared: 0.8469, Adjusted R-squared: 0.8445
## F-statistic: 355 on 6 and 385 DF, p-value: < 2.2e-16
# Square root
lm_sqrt <- lm(mpg ~ sqrt(displacement) + sqrt(horsepower) + sqrt(weight) + acceleration + year + origin,
data = Auto)
summary(lm_sqrt)
##
## Call:
## lm(formula = mpg ~ sqrt(displacement) + sqrt(horsepower) + sqrt(weight) +
## acceleration + year + origin, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4363 -2.0056 -0.1541 1.7213 13.0062
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.89538 4.93212 0.993 0.3216
## sqrt(displacement) 0.19846 0.15916 1.247 0.2132
## sqrt(horsepower) -0.65154 0.30287 -2.151 0.0321 *
## sqrt(weight) -0.64035 0.07755 -8.257 2.41e-15 ***
## acceleration -0.04552 0.10235 -0.445 0.6567
## year 0.73604 0.04920 14.960 < 2e-16 ***
## origin 1.15233 0.27541 4.184 3.55e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.206 on 385 degrees of freedom
## Multiple R-squared: 0.8339, Adjusted R-squared: 0.8313
## F-statistic: 322.1 on 6 and 385 DF, p-value: < 2.2e-16
# Polynomial (X^2) for horsepower
lm_poly <- lm(mpg ~ displacement + poly(horsepower, 2) + weight + acceleration + year + origin,
data = Auto)
summary(lm_poly)
##
## Call:
## lm(formula = mpg ~ displacement + poly(horsepower, 2) + weight +
## acceleration + year + origin, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7680 -1.6744 -0.1844 1.5704 12.0786
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.893e+01 3.755e+00 -5.041 7.16e-07 ***
## displacement -1.670e-03 5.278e-03 -0.316 0.75193
## poly(horsepower, 2)1 -5.120e+01 1.030e+01 -4.970 1.01e-06 ***
## poly(horsepower, 2)2 3.477e+01 3.649e+00 9.527 < 2e-16 ***
## weight -3.303e-03 6.784e-04 -4.869 1.65e-06 ***
## acceleration -3.210e-01 9.887e-02 -3.246 0.00127 **
## year 7.351e-01 4.601e-02 15.977 < 2e-16 ***
## origin 1.056e+00 2.520e-01 4.190 3.46e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.002 on 384 degrees of freedom
## Multiple R-squared: 0.8547, Adjusted R-squared: 0.8521
## F-statistic: 322.8 on 7 and 384 DF, p-value: < 2.2e-16
Comment: Log-transforming displacement,
horsepower, and weight increases the adjusted
R² and reduces non-linearity in the residuals, suggesting these
predictors have a multiplicative rather than additive relationship with
mpg. The polynomial term for horsepower is
also significant, confirming the curvilinear relationship observed in
the scatterplots.
data(Boston)
str(Boston)
## 'data.frame': 506 obs. of 13 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
predictors <- setdiff(names(Boston), "crim")
slr_results <- lapply(predictors, function(var) {
fit <- lm(as.formula(paste("crim ~", var)), data = Boston)
s <- summary(fit)
data.frame(
predictor = var,
coef = coef(fit)[2],
p_value = s$coefficients[2, 4],
r_squared = s$r.squared
)
})
slr_df <- do.call(rbind, slr_results) %>% arrange(p_value)
print(slr_df)
## predictor coef p_value r_squared
## rad rad 0.61791093 2.693844e-56 0.391256687
## tax tax 0.02974225 2.357127e-47 0.339614243
## lstat lstat 0.54880478 2.654277e-27 0.207590933
## nox nox 31.24853120 3.751739e-23 0.177217182
## indus indus 0.50977633 1.450349e-21 0.165310070
## medv medv -0.36315992 1.173987e-19 0.150780469
## dis dis -1.55090168 8.519949e-19 0.144149375
## age age 0.10778623 2.854869e-16 0.124421452
## ptratio ptratio 1.15198279 2.942922e-11 0.084068439
## rm rm -2.68405122 6.346703e-07 0.048069117
## zn zn -0.07393498 5.506472e-06 0.040187908
## chas chas -1.89277655 2.094345e-01 0.003123869
# Plot a few key relationships
Boston %>%
tidyr::pivot_longer(cols = c(lstat, medv, dis, nox, rad),
names_to = "predictor", values_to = "value") %>%
ggplot(aes(x = value, y = crim)) +
geom_point(alpha = 0.3, size = 0.8, colour = "steelblue") +
geom_smooth(method = "lm", colour = "tomato", se = TRUE) +
facet_wrap(~predictor, scales = "free_x") +
labs(title = "Per Capita Crime Rate vs. Selected Predictors")
Comment: Almost all predictors show a statistically
significant association with crim in univariate
regressions. rad (accessibility to radial highways),
tax (property-tax rate), and lstat
(lower-status population %) have the strongest positive associations.
dis (distance to employment centers) and medv
(median home value) have negative associations.
mlr_fit <- lm(crim ~ ., data = Boston)
summary(mlr_fit)
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.534 -2.248 -0.348 1.087 73.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.7783938 7.0818258 1.946 0.052271 .
## zn 0.0457100 0.0187903 2.433 0.015344 *
## indus -0.0583501 0.0836351 -0.698 0.485709
## chas -0.8253776 1.1833963 -0.697 0.485841
## nox -9.9575865 5.2898242 -1.882 0.060370 .
## rm 0.6289107 0.6070924 1.036 0.300738
## age -0.0008483 0.0179482 -0.047 0.962323
## dis -1.0122467 0.2824676 -3.584 0.000373 ***
## rad 0.6124653 0.0875358 6.997 8.59e-12 ***
## tax -0.0037756 0.0051723 -0.730 0.465757
## ptratio -0.3040728 0.1863598 -1.632 0.103393
## lstat 0.1388006 0.0757213 1.833 0.067398 .
## medv -0.2200564 0.0598240 -3.678 0.000261 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4359
## F-statistic: 33.52 on 12 and 493 DF, p-value: < 2.2e-16
Comment: In the multiple regression, only a handful
of predictors remain significant after controlling for others:
dis, rad, medv, and
zn. Many predictors significant in univariate models (e.g.,
lstat, tax) lose significance due to
multicollinearity.
# Fully self-contained: rebuild everything needed here
data(Boston)
all_preds <- setdiff(names(Boston), "crim")
# Univariate coefficients
uni_coefs_c <- sapply(all_preds, function(var) {
fit <- lm(Boston$crim ~ Boston[[var]])
coef(fit)[2]
})
# Multiple regression coefficients
mlr_c <- lm(crim ~ ., data = Boston)
multi_c <- coef(mlr_c)[-1] # drop intercept
# Build data frame — align by name
coef_df <- data.frame(
predictor = all_preds,
univariate = as.numeric(uni_coefs_c),
multiple = as.numeric(multi_c[all_preds]),
stringsAsFactors = FALSE
)
# Remove any rows with NA
coef_df <- coef_df[complete.cases(coef_df), ]
print(coef_df) # sanity check — should show 12 rows with no NA
## predictor univariate multiple
## 1 zn -0.07393498 0.0457100386
## 2 indus 0.50977633 -0.0583501107
## 3 chas -1.89277655 -0.8253775522
## 4 nox 31.24853120 -9.9575865471
## 5 rm -2.68405122 0.6289106622
## 6 age 0.10778623 -0.0008482791
## 7 dis -1.55090168 -1.0122467382
## 8 rad 0.61791093 0.6124653115
## 9 tax 0.02974225 -0.0037756465
## 10 ptratio 1.15198279 -0.3040727572
## 11 lstat 0.54880478 0.1388005968
## 12 medv -0.36315992 -0.2200563590
ggplot(coef_df, aes(x = univariate, y = multiple, label = predictor)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey60") +
geom_point(colour = "steelblue", size = 3) +
geom_text(vjust = -0.7, hjust = 0.5, size = 3, colour = "black") +
scale_x_continuous(expand = expansion(mult = 0.15)) +
scale_y_continuous(expand = expansion(mult = 0.15)) +
labs(title = "Univariate vs. Multiple Regression Coefficients",
subtitle = "Each point = one predictor of crim (Boston dataset)",
x = "Simple (Univariate) Regression Coefficient",
y = "Multiple Regression Coefficient") +
theme_bw(base_size = 12)
Comment: The coefficients differ substantially
between univariate and multiple regression, most notably for
nox — its large positive univariate coefficient shrinks
dramatically in multiple regression, indicating confounding. The dashed
45° line represents perfect agreement; most points fall below it,
reflecting attenuation due to multicollinearity.
poly_results <- lapply(predictors, function(var) {
# skip binary variable
if (length(unique(Boston[[var]])) < 4) return(NULL)
fit <- lm(as.formula(paste("crim ~ poly(", var, ", 3)")), data = Boston)
s <- summary(fit)
pvals <- s$coefficients[-1, 4]
data.frame(
predictor = var,
p_linear = pvals[1],
p_quadratic = pvals[2],
p_cubic = pvals[3]
)
})
poly_df <- do.call(rbind, Filter(Negate(is.null), poly_results))
print(poly_df)
## predictor p_linear p_quadratic p_cubic
## poly(zn, 3)1 zn 4.697806e-06 4.420507e-03 2.295386e-01
## poly(indus, 3)1 indus 8.854243e-24 1.086057e-03 1.196405e-12
## poly(nox, 3)1 nox 2.457491e-26 7.736755e-05 6.961110e-16
## poly(rm, 3)1 rm 5.128048e-07 1.508545e-03 5.085751e-01
## poly(age, 3)1 age 4.878803e-17 2.291156e-06 6.679915e-03
## poly(dis, 3)1 dis 1.253249e-21 7.869767e-14 1.088832e-08
## poly(rad, 3)1 rad 1.053211e-56 9.120558e-03 4.823138e-01
## poly(tax, 3)1 tax 6.976314e-49 3.665348e-06 2.438507e-01
## poly(ptratio, 3)1 ptratio 1.565484e-11 2.405468e-03 6.300514e-03
## poly(lstat, 3)1 lstat 1.678072e-27 3.780418e-02 1.298906e-01
## poly(medv, 3)1 medv 4.930818e-27 2.928577e-35 1.046510e-12
Comment: Several predictors show statistically
significant quadratic and/or cubic terms — notably dis,
lstat, nox, and medv — providing
evidence of non-linear associations with crime rate. This suggests that
purely linear models may be insufficient and that polynomial or other
non-linear approaches could improve fit.
data(Weekly)
str(Weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
ggpairs(Weekly %>% select(-Direction),
lower = list(continuous = wrap("points", alpha = 0.2, size = 0.5)),
upper = list(continuous = wrap("cor", size = 3)),
title = "Weekly Data – Scatterplot Matrix")
ggplot(Weekly, aes(x = Year, y = Volume)) +
geom_line(colour = "steelblue") +
labs(title = "Trading Volume Over Time", y = "Volume (billions of shares)")
Patterns: Trading volume has increased substantially
over the 21-year period. The lag variables (Lag1–Lag5) show very low
correlations with each other and with Today, consistent
with the weak predictability of stock returns. Volume and
Year are strongly correlated, reflecting the secular growth
in market activity.
glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(glm_full)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Comment: Only Lag2 has a statistically
significant coefficient (p ≈ 0.03). The other lag variables and Volume
are not significant predictors of market direction when all are included
together.
prob_full <- predict(glm_full, type = "response")
pred_full <- ifelse(prob_full > 0.5, "Up", "Down")
conf_full <- table(Predicted = pred_full, Actual = Weekly$Direction)
print(conf_full)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
cat("Overall Correct:", mean(pred_full == Weekly$Direction), "\n")
## Overall Correct: 0.5610652
Interpretation: The model predominantly predicts “Up”, achieving a high accuracy for Up weeks but almost completely missing Down weeks. The overall fraction correct (~56%) is only marginally better than random guessing, driven largely by the market’s upward bias during this period. The high false-positive rate (Down predicted as Up) indicates poor discrimination.
Lag2
Onlytrain <- Weekly %>% filter(Year <= 2008)
test <- Weekly %>% filter(Year >= 2009)
glm_lag2 <- glm(Direction ~ Lag2,
data = train,
family = binomial)
prob_test <- predict(glm_lag2, newdata = test, type = "response")
pred_test <- ifelse(prob_test > 0.5, "Up", "Down")
conf_test <- table(Predicted = pred_test, Actual = test$Direction)
print(conf_test)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
cat("Overall Correct (test):", mean(pred_test == test$Direction), "\n")
## Overall Correct (test): 0.625
Comment: Using only Lag2 as a predictor
on the held-out 2009–2010 data, the model achieves approximately 62.5%
correct predictions. The confusion matrix shows that the model still
favours predicting “Up”, but captures a reasonable fraction of true Up
weeks. This is a modest improvement over the full-model baseline,
consistent with Lag2 being the only significant predictor
in the full model. ```