This exercise involves the
Autodataset. We begin by removing any rows with missing values.
var_types <- data.frame(
Variable = names(Auto),
Type = c("Quantitative","Quantitative","Quantitative",
"Quantitative","Quantitative","Quantitative",
"Quantitative","Qualitative","Qualitative"),
Description = c("Miles per gallon","Number of cylinders",
"Engine displacement (cu. in.)","Engine horsepower",
"Vehicle weight (lbs.)","Time to accelerate 0-60 mph (sec)",
"Model year","Origin (1=US, 2=Europe, 3=Japan)",
"Vehicle name")
)
var_types %>%
kable("html", caption = "Variable Types in the Auto Dataset") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE) %>%
row_spec(which(var_types$Type == "Qualitative"), background = "#fff3cd")| Variable | Type | Description |
|---|---|---|
| mpg | Quantitative | Miles per gallon |
| cylinders | Quantitative | Number of cylinders |
| displacement | Quantitative | Engine displacement (cu. in.) |
| horsepower | Quantitative | Engine horsepower |
| weight | Quantitative | Vehicle weight (lbs.) |
| acceleration | Quantitative | Time to accelerate 0-60 mph (sec) |
| year | Quantitative | Model year |
| origin | Qualitative | Origin (1=US, 2=Europe, 3=Japan) |
| name | Qualitative | Vehicle name |
Note: origin is coded as an integer but
represents a categorical region of manufacture. name is a
character identifier. All remaining variables are continuous
quantitative predictors.
quant_vars <- Auto %>% select(mpg, cylinders, displacement,
horsepower, weight, acceleration, year)
range_df <- data.frame(
Variable = names(quant_vars),
Min = sapply(quant_vars, min),
Max = sapply(quant_vars, max),
Range = sapply(quant_vars, function(x) diff(range(x)))
)
range_df %>%
kable("html", caption = "Range of Quantitative Predictors", digits = 2) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE)| Variable | Min | Max | Range | |
|---|---|---|---|---|
| mpg | mpg | 9 | 46.6 | 37.6 |
| cylinders | cylinders | 3 | 8.0 | 5.0 |
| displacement | displacement | 68 | 455.0 | 387.0 |
| horsepower | horsepower | 46 | 230.0 | 184.0 |
| weight | weight | 1613 | 5140.0 | 3527.0 |
| acceleration | acceleration | 8 | 24.8 | 16.8 |
| year | year | 70 | 82.0 | 12.0 |
stats_df <- data.frame(
Variable = names(quant_vars),
Mean = sapply(quant_vars, mean),
SD = sapply(quant_vars, sd)
)
stats_df %>%
kable("html", caption = "Mean and SD of Quantitative Predictors", digits = 3) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE)| Variable | Mean | SD | |
|---|---|---|---|
| mpg | mpg | 23.446 | 7.805 |
| cylinders | cylinders | 5.472 | 1.706 |
| displacement | displacement | 194.412 | 104.644 |
| horsepower | horsepower | 104.469 | 38.491 |
| weight | weight | 2977.584 | 849.403 |
| acceleration | acceleration | 15.541 | 2.759 |
| year | year | 75.980 | 3.684 |
auto_sub <- Auto[-(10:85), ]
quant_sub <- auto_sub %>% select(mpg, cylinders, displacement,
horsepower, weight, acceleration, year)
sub_stats <- data.frame(
Variable = names(quant_sub),
Min = sapply(quant_sub, min),
Max = sapply(quant_sub, max),
Mean = sapply(quant_sub, mean),
SD = sapply(quant_sub, sd)
)
sub_stats %>%
kable("html", caption = "Summary Statistics After Removing Observations 10-85",
digits = 3) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE)| Variable | Min | Max | Mean | SD | |
|---|---|---|---|---|---|
| mpg | mpg | 11.0 | 46.6 | 24.404 | 7.867 |
| cylinders | cylinders | 3.0 | 8.0 | 5.373 | 1.654 |
| displacement | displacement | 68.0 | 455.0 | 187.241 | 99.678 |
| horsepower | horsepower | 46.0 | 230.0 | 100.722 | 35.709 |
| weight | weight | 1649.0 | 4997.0 | 2935.972 | 811.300 |
| acceleration | acceleration | 8.5 | 24.8 | 15.727 | 2.694 |
| year | year | 70.0 | 82.0 | 77.146 | 3.106 |
Note: Removing 76 observations causes modest shifts in means and standard deviations. Ranges narrow slightly for some variables because extreme values fall within rows 10-85.
ggpairs(
quant_vars,
lower = list(continuous = wrap("points", alpha = 0.3, size = 0.8, color = "steelblue")),
diag = list(continuous = wrap("densityDiag", fill = "steelblue", alpha = 0.4)),
upper = list(continuous = wrap("cor", size = 3.5)),
title = "Scatterplot Matrix - Auto Dataset (Quantitative Variables)"
) + theme_bw(base_size = 9)Findings:
displacement, weight, and
horsepower are strongly positively correlated (r >
0.85), indicating high multicollinearity among these “size/power”
variables.mpg is strongly negatively correlated
with displacement (r = -0.81), weight (r =
-0.83), and horsepower (r = -0.78).year shows a moderate positive correlation with
mpg (r = 0.58), reflecting improving fuel efficiency
standards over time.acceleration has relatively weak correlations with most
other variables, making it a more independent predictor.mpgquant_vars %>%
mutate(mpg = Auto$mpg) %>%
pivot_longer(-mpg, names_to = "variable", values_to = "value") %>%
ggplot(aes(x = value, y = mpg)) +
geom_point(alpha = 0.3, color = "steelblue", size = 1) +
geom_smooth(method = "loess", color = "red", se = TRUE, linewidth = 0.8) +
facet_wrap(~variable, scales = "free_x", ncol = 3) +
labs(title = "mpg vs. Each Quantitative Predictor",
subtitle = "Red line = LOESS smoother",
x = "Predictor Value", y = "Miles Per Gallon") +
theme_bw()Comment: The LOESS curves reveal that
weight, displacement, and
horsepower have strong, slightly non-linear negative
relationships with mpg. year shows a clear
positive trend. cylinders is discrete with a step-like
negative association. Based on these plots, weight,
horsepower, displacement, and
year appear to be the most informative
predictors.
Multiple linear regression on the
Autodataset withmpgas the response.
ggpairs(
Auto %>% select(-name),
lower = list(continuous = wrap("points", alpha = 0.2, size = 0.7, color = "darkorange")),
diag = list(continuous = wrap("densityDiag", fill = "darkorange", alpha = 0.4),
discrete = wrap("barDiag", fill = "darkorange", alpha = 0.6)),
upper = list(continuous = wrap("cor", size = 3),
combo = wrap("box_no_facet")),
title = "Scatterplot Matrix - Auto (All Variables)"
) + theme_bw(base_size = 8)cor_mat <- cor(Auto %>% select(-name))
corrplot(cor_mat,
method = "color",
type = "upper",
order = "hclust",
addCoef.col = "black",
number.cex = 0.7,
tl.cex = 0.85,
col = colorRampPalette(c("#d73027","white","#1a6faf"))(200),
title = "Correlation Matrix - Auto Dataset",
mar = c(0,0,1.5,0))Observation: The upper-left cluster
(mpg, cylinders, displacement,
horsepower, weight) shows very high
inter-correlations, confirming multicollinearity. year and
origin are less correlated with this cluster.
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
coef_df <- as.data.frame(summary(lm_fit)$coefficients)
coef_df$Sig <- ifelse(coef_df[,4] < 0.001, "***",
ifelse(coef_df[,4] < 0.01, "**",
ifelse(coef_df[,4] < 0.05, "*",
ifelse(coef_df[,4] < 0.1, ".", ""))))
colnames(coef_df)[1:4] <- c("Estimate","Std. Error","t value","Pr(>|t|)")
coef_df %>%
kable("html", caption = "MLR Coefficient Summary", digits = 4) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE) %>%
row_spec(which(coef_df$Sig %in% c("*","**","***")), background = "#d4edda")| Estimate | Std. Error | t value | Pr(>|t|) | Sig | |
|---|---|---|---|---|---|
| (Intercept) | -17.2184 | 4.6443 | -3.7074 | 0.0002 | *** |
| cylinders | -0.4934 | 0.3233 | -1.5261 | 0.1278 | |
| displacement | 0.0199 | 0.0075 | 2.6474 | 0.0084 | ** |
| horsepower | -0.0170 | 0.0138 | -1.2295 | 0.2196 | |
| weight | -0.0065 | 0.0007 | -9.9288 | 0.0000 | *** |
| acceleration | 0.0806 | 0.0988 | 0.8152 | 0.4155 | |
| year | 0.7508 | 0.0510 | 14.7288 | 0.0000 | *** |
| origin | 1.4261 | 0.2781 | 5.1275 | 0.0000 | *** |
i. Yes — the F-statistic is large with a near-zero
p-value; the predictors collectively explain ~82% of variance in
mpg (Adjusted R² ≈ 0.82).
ii. displacement, weight,
year, and origin are significant at the 5%
level (highlighted). horsepower and
acceleration lose significance once correlated predictors
are controlled for.
iii. The positive year coefficient
(~0.75) implies that, holding all else constant, each additional model
year is associated with approximately 0.75 more mpg —
capturing industry-wide fuel efficiency improvements.
Comment: The Residuals vs. Fitted plot shows a slight U-shape suggesting mild non-linearity. The Scale-Location plot shows some heteroscedasticity at higher fitted values. Observation 14 has notably high leverage; points 323, 326, and 327 are potential outliers.
lm_interact <- lm(mpg ~ displacement * weight + horsepower * acceleration +
year + origin, data = Auto)
summary(lm_interact)##
## Call:
## lm(formula = mpg ~ displacement * weight + horsepower * acceleration +
## year + origin, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7332 -1.6889 -0.0407 1.5378 12.0407
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.503e+01 4.854e+00 -3.096 0.002107 **
## displacement -6.418e-02 8.988e-03 -7.141 4.70e-12 ***
## weight -8.473e-03 8.612e-04 -9.839 < 2e-16 ***
## horsepower 4.894e-02 2.381e-02 2.055 0.040553 *
## acceleration 5.710e-01 1.528e-01 3.737 0.000215 ***
## year 7.817e-01 4.460e-02 17.526 < 2e-16 ***
## origin 5.117e-01 2.542e-01 2.013 0.044838 *
## displacement:weight 1.900e-05 2.335e-06 8.135 5.79e-15 ***
## horsepower:acceleration -6.816e-03 1.702e-03 -4.005 7.45e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.904 on 383 degrees of freedom
## Multiple R-squared: 0.8644, Adjusted R-squared: 0.8615
## F-statistic: 305.1 on 8 and 383 DF, p-value: < 2.2e-16
Comment: The displacement:weight
interaction is statistically significant, indicating that the combined
effect of engine size and vehicle mass is not purely additive — very
heavy, large-engine cars suffer a compounded fuel economy penalty.
lm_log <- lm(mpg ~ log(displacement) + log(horsepower) + log(weight) +
acceleration + year + origin, data = Auto)
lm_sqrt <- lm(mpg ~ sqrt(horsepower) + sqrt(weight) +
acceleration + year + origin, data = Auto)
lm_quad <- lm(mpg ~ horsepower + I(horsepower^2) + weight +
acceleration + year + origin, data = Auto)
model_comp <- data.frame(
Model = c("Linear (baseline)","Log-transformed","Sqrt-transformed","Quadratic (hp2)"),
Adj_R2 = c(summary(lm_fit)$adj.r.squared, summary(lm_log)$adj.r.squared,
summary(lm_sqrt)$adj.r.squared, summary(lm_quad)$adj.r.squared),
RSE = c(summary(lm_fit)$sigma, summary(lm_log)$sigma,
summary(lm_sqrt)$sigma, summary(lm_quad)$sigma)
)
model_comp %>%
kable("html", caption = "Model Comparison - Transformations", digits = 4) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE) %>%
row_spec(which.max(model_comp$Adj_R2), background = "#d4edda")| Model | Adj_R2 | RSE |
|---|---|---|
| Linear (baseline) | 0.8182 | 3.3277 |
| Log-transformed | 0.8445 | 3.0775 |
| Sqrt-transformed | 0.8310 | 3.2082 |
| Quadratic (hp2) | 0.8524 | 2.9983 |
Comment: Log-transforming displacement,
horsepower, and weight yields the highest
adjusted R² and lowest RSE. This makes physical sense — the relationship
between engine/body size and fuel economy is multiplicative, not
additive.
Predicting per capita crime rate (
crim) using theBostondataset.
slr_results <- lapply(predictors_boston, function(var) {
fit <- lm(as.formula(paste("crim ~", var)), data = Boston)
s <- summary(fit)
data.frame(
Predictor = var,
Coefficient = round(coef(fit)[2], 4),
R2 = round(s$r.squared, 4),
p_value = signif(s$coefficients[2, 4], 3)
)
})
slr_df <- do.call(rbind, slr_results)
slr_df %>%
arrange(p_value) %>%
kable("html", caption = "Simple Linear Regression - Each Predictor vs. crim",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE) %>%
row_spec(which(sort(slr_df$p_value) < 0.05), background = "#d4edda")| Predictor | Coefficient | R2 | p_value | |
|---|---|---|---|---|
| rad | rad | 0.6179 | 0.3913 | 0.000 |
| tax | tax | 0.0297 | 0.3396 | 0.000 |
| lstat | lstat | 0.5488 | 0.2076 | 0.000 |
| nox | nox | 31.2485 | 0.1772 | 0.000 |
| indus | indus | 0.5098 | 0.1653 | 0.000 |
| medv | medv | -0.3632 | 0.1508 | 0.000 |
| dis | dis | -1.5509 | 0.1441 | 0.000 |
| age | age | 0.1078 | 0.1244 | 0.000 |
| ptratio | ptratio | 1.1520 | 0.0841 | 0.000 |
| rm | rm | -2.6841 | 0.0481 | 0.000 |
| zn | zn | -0.0739 | 0.0402 | 0.000 |
| chas | chas | -1.8928 | 0.0031 | 0.209 |
top6 <- slr_df %>% arrange(p_value) %>% head(6) %>% pull(Predictor)
Boston %>%
select(crim, all_of(top6)) %>%
pivot_longer(-crim, names_to = "variable", values_to = "value") %>%
ggplot(aes(x = value, y = crim)) +
geom_point(alpha = 0.25, color = "tomato", size = 1) +
geom_smooth(method = "lm", color = "navy", se = TRUE, linewidth = 0.9) +
facet_wrap(~variable, scales = "free_x", ncol = 3) +
labs(title = "Top 6 Predictors of Per Capita Crime Rate (Boston)",
subtitle = "Simple linear regression fits shown",
x = "Predictor Value", y = "Crime Rate (crim)") +
theme_bw()Comment: Almost all predictors show a statistically
significant univariate association with crim.
rad (highway access), tax (property tax), and
lstat (lower-status population %) have particularly strong
associations.
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.534 -2.248 -0.348 1.087 73.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.7783938 7.0818258 1.946 0.052271 .
## zn 0.0457100 0.0187903 2.433 0.015344 *
## indus -0.0583501 0.0836351 -0.698 0.485709
## chas -0.8253776 1.1833963 -0.697 0.485841
## nox -9.9575865 5.2898242 -1.882 0.060370 .
## rm 0.6289107 0.6070924 1.036 0.300738
## age -0.0008483 0.0179482 -0.047 0.962323
## dis -1.0122467 0.2824676 -3.584 0.000373 ***
## rad 0.6124653 0.0875358 6.997 8.59e-12 ***
## tax -0.0037756 0.0051723 -0.730 0.465757
## ptratio -0.3040728 0.1863598 -1.632 0.103393
## lstat 0.1388006 0.0757213 1.833 0.067398 .
## medv -0.2200564 0.0598240 -3.678 0.000261 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4359
## F-statistic: 33.52 on 12 and 493 DF, p-value: < 2.2e-16
coef_b <- as.data.frame(summary(mlr_boston)$coefficients)
coef_b$Sig <- ifelse(coef_b[,4] < 0.001,"***",
ifelse(coef_b[,4] < 0.01, "**",
ifelse(coef_b[,4] < 0.05, "*",
ifelse(coef_b[,4] < 0.1, ".",""))))
colnames(coef_b)[1:4] <- c("Estimate","Std. Error","t value","Pr(>|t|)")
coef_b %>%
kable("html", caption = "Multiple Regression Coefficients - Boston", digits = 4) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE) %>%
row_spec(which(coef_b$Sig %in% c("*","**","***")), background = "#d4edda")| Estimate | Std. Error | t value | Pr(>|t|) | Sig | |
|---|---|---|---|---|---|
| (Intercept) | 13.7784 | 7.0818 | 1.9456 | 0.0523 | . |
| zn | 0.0457 | 0.0188 | 2.4326 | 0.0153 |
|
| indus | -0.0584 | 0.0836 | -0.6977 | 0.4857 | |
| chas | -0.8254 | 1.1834 | -0.6975 | 0.4858 | |
| nox | -9.9576 | 5.2898 | -1.8824 | 0.0604 | . |
| rm | 0.6289 | 0.6071 | 1.0359 | 0.3007 | |
| age | -0.0008 | 0.0179 | -0.0473 | 0.9623 | |
| dis | -1.0122 | 0.2825 | -3.5836 | 0.0004 | *** |
| rad | 0.6125 | 0.0875 | 6.9967 | 0.0000 | *** |
| tax | -0.0038 | 0.0052 | -0.7300 | 0.4658 | |
| ptratio | -0.3041 | 0.1864 | -1.6316 | 0.1034 | |
| lstat | 0.1388 | 0.0757 | 1.8330 | 0.0674 | . |
| medv | -0.2201 | 0.0598 | -3.6784 | 0.0003 | *** |
Comment: In the full model, only zn,
dis, rad, black, and
medv remain significant. Variables like nox
and lstat lose significance when controlling for correlated
predictors — evidence of multicollinearity.
mlr_coefs <- coef(mlr_boston)[-1]
common <- intersect(names(mlr_coefs), slr_df$Predictor)
plot_df <- data.frame(
Predictor = common,
Simple_Coef = slr_df$Coefficient[match(common, slr_df$Predictor)],
Multi_Coef = mlr_coefs[common]
)
ggplot(plot_df, aes(x = Simple_Coef, y = Multi_Coef, label = Predictor)) +
geom_point(color = "steelblue", size = 3) +
geom_text(vjust = -0.7, size = 3.2) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(title = "Univariate vs. Multiple Regression Coefficients",
subtitle = "Dashed line = perfect agreement",
x = "Simple Regression Coefficient",
y = "Multiple Regression Coefficient") +
theme_bw()Comment: Substantial divergence exists for several
predictors, most notably nox — its simple regression
coefficient is large but reverses sign in the multiple model. This
sign reversal is a classic symptom of confounding due
to multicollinearity.
poly_results <- lapply(predictors_boston, function(var) {
if (length(unique(Boston[[var]])) < 4) return(NULL)
fit <- lm(as.formula(paste("crim ~ poly(", var, ", 3)")), data = Boston)
pvals <- summary(fit)$coefficients[-1, 4]
data.frame(
Predictor = var,
p_linear = round(pvals[1], 4),
p_quadratic = ifelse(length(pvals) >= 2, round(pvals[2], 4), NA),
p_cubic = ifelse(length(pvals) >= 3, round(pvals[3], 4), NA)
)
})
poly_df <- do.call(rbind, Filter(Negate(is.null), poly_results))
poly_df %>%
kable("html", caption = "Polynomial Regression p-values - Boston Predictors",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE) %>%
row_spec(
which((!is.na(poly_df$p_quadratic) & poly_df$p_quadratic < 0.05) |
(!is.na(poly_df$p_cubic) & poly_df$p_cubic < 0.05)),
background = "#fff3cd"
)| Predictor | p_linear | p_quadratic | p_cubic | |
|---|---|---|---|---|
| poly(zn, 3)1 | zn | 0 | 0.0044 | 0.2295 |
| poly(indus, 3)1 | indus | 0 | 0.0011 | 0.0000 |
| poly(nox, 3)1 | nox | 0 | 0.0001 | 0.0000 |
| poly(rm, 3)1 | rm | 0 | 0.0015 | 0.5086 |
| poly(age, 3)1 | age | 0 | 0.0000 | 0.0067 |
| poly(dis, 3)1 | dis | 0 | 0.0000 | 0.0000 |
| poly(rad, 3)1 | rad | 0 | 0.0091 | 0.4823 |
| poly(tax, 3)1 | tax | 0 | 0.0000 | 0.2439 |
| poly(ptratio, 3)1 | ptratio | 0 | 0.0024 | 0.0063 |
| poly(lstat, 3)1 | lstat | 0 | 0.0378 | 0.1299 |
| poly(medv, 3)1 | medv | 0 | 0.0000 | 0.0000 |
Comment: Significant quadratic or cubic terms appear
for indus, nox, age,
dis, ptratio, and medv,
indicating non-linear relationships with crime rate that simple linear
models would underfit.
Predicting weekly stock market direction using the
Weeklydataset (1990-2010).
summary(Weekly) %>%
kable("html", caption = "Summary Statistics - Weekly Dataset") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = TRUE, font_size = 12)| Year | Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today | Direction | |
|---|---|---|---|---|---|---|---|---|---|
| Min. :1990 | Min. :-18.1950 | Min. :-18.1950 | Min. :-18.1950 | Min. :-18.1950 | Min. :-18.1950 | Min. :0.08747 | Min. :-18.1950 | Down:484 | |
| 1st Qu.:1995 | 1st Qu.: -1.1540 | 1st Qu.: -1.1540 | 1st Qu.: -1.1580 | 1st Qu.: -1.1580 | 1st Qu.: -1.1660 | 1st Qu.:0.33202 | 1st Qu.: -1.1540 | Up :605 | |
| Median :2000 | Median : 0.2410 | Median : 0.2410 | Median : 0.2410 | Median : 0.2380 | Median : 0.2340 | Median :1.00268 | Median : 0.2410 | NA | |
| Mean :2000 | Mean : 0.1506 | Mean : 0.1511 | Mean : 0.1472 | Mean : 0.1458 | Mean : 0.1399 | Mean :1.57462 | Mean : 0.1499 | NA | |
| 3rd Qu.:2005 | 3rd Qu.: 1.4050 | 3rd Qu.: 1.4090 | 3rd Qu.: 1.4090 | 3rd Qu.: 1.4090 | 3rd Qu.: 1.4050 | 3rd Qu.:2.05373 | 3rd Qu.: 1.4050 | NA | |
| Max. :2010 | Max. : 12.0260 | Max. : 12.0260 | Max. : 12.0260 | Max. : 12.0260 | Max. : 12.0260 | Max. :9.32821 | Max. : 12.0260 | NA |
ggplot(Weekly, aes(x = Year, y = Volume)) +
geom_line(color = "darkgreen", linewidth = 0.7) +
geom_smooth(method = "loess", color = "red", se = FALSE, linewidth = 1) +
labs(title = "NYSE Trading Volume Over Time (1990-2010)",
subtitle = "Red = LOESS trend",
x = "Year", y = "Average Weekly Volume (billions of shares)") +
theme_bw()Weekly %>%
count(Direction) %>%
mutate(pct = n / sum(n) * 100) %>%
ggplot(aes(x = Direction, y = n, fill = Direction)) +
geom_col(width = 0.5) +
geom_text(aes(label = sprintf("%d (%.1f%%)", n, pct)), vjust = -0.5, size = 4) +
scale_fill_manual(values = c("Down" = "tomato", "Up" = "steelblue")) +
labs(title = "Market Direction - Weekly Dataset", y = "Count", x = "") +
theme_bw() + theme(legend.position = "none")Comment: Trading volume increased dramatically from 1990 to ~2004, then levelled off. The market moved “Up” in approximately 56.1% of weeks, reflecting the typical upward drift of equity markets. The lag variables show little obvious serial pattern.
glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(glm_full)##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
coef_glm <- as.data.frame(summary(glm_full)$coefficients)
coef_glm$Sig <- ifelse(coef_glm[,4] < 0.001,"***",
ifelse(coef_glm[,4] < 0.01, "**",
ifelse(coef_glm[,4] < 0.05, "*",
ifelse(coef_glm[,4] < 0.1, ".",""))))
colnames(coef_glm)[1:4] <- c("Estimate","Std. Error","z value","Pr(>|z|)")
coef_glm %>%
kable("html", caption = "Logistic Regression Coefficients - Full Model", digits = 4) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE) %>%
row_spec(which(coef_glm$Sig %in% c("*","**","***")), background = "#d4edda")| Estimate | Std. Error | z value | Pr(>|z|) | Sig | |
|---|---|---|---|---|---|
| (Intercept) | 0.2669 | 0.0859 | 3.1056 | 0.0019 | ** |
| Lag1 | -0.0413 | 0.0264 | -1.5626 | 0.1181 | |
| Lag2 | 0.0584 | 0.0269 | 2.1754 | 0.0296 |
|
| Lag3 | -0.0161 | 0.0267 | -0.6024 | 0.5469 | |
| Lag4 | -0.0278 | 0.0265 | -1.0501 | 0.2937 | |
| Lag5 | -0.0145 | 0.0264 | -0.5485 | 0.5833 | |
| Volume | -0.0227 | 0.0369 | -0.6163 | 0.5377 |
Comment: Only Lag2 is statistically
significant (p ~ 0.03). Its positive coefficient suggests a higher
return two weeks ago is weakly associated with an “Up” direction this
week. Other lags and Volume do not show significant predictive
power.
glm_probs <- predict(glm_full, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, "Up", "Down")
conf_mat <- table(Predicted = glm_pred, Actual = Weekly$Direction)
print(conf_mat)## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
accuracy <- mean(glm_pred == Weekly$Direction)
precision <- conf_mat["Up","Up"] / sum(conf_mat["Up",])
recall <- conf_mat["Up","Up"] / sum(conf_mat[,"Up"])
data.frame(
Metric = c("Overall Accuracy","Precision (Up)","Recall / Sensitivity (Up)"),
Value = round(c(accuracy, precision, recall), 4)
) %>%
kable("html", caption = "Performance Metrics - Full Logistic Regression") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE)| Metric | Value |
|---|---|
| Overall Accuracy | 0.5611 |
| Precision (Up) | 0.5643 |
| Recall / Sensitivity (Up) | 0.9207 |
Comment: Overall accuracy of 56.1% barely beats the naive baseline of always predicting “Up” (~56%). The model’s high recall for “Up” is offset by very poor specificity — it almost never correctly identifies “Down” weeks.
Lag2 Onlytrain <- Weekly$Year <= 2008
test_data <- Weekly[!train, ]
glm_lag2 <- glm(Direction ~ Lag2, data = Weekly,
family = binomial, subset = train)
lag2_probs <- predict(glm_lag2, newdata = test_data, type = "response")
lag2_pred <- ifelse(lag2_probs > 0.5, "Up", "Down")
conf_mat2 <- table(Predicted = lag2_pred, Actual = test_data$Direction)
print(conf_mat2)## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
roc_obj <- roc(test_data$Direction, lag2_probs, levels = c("Down","Up"))
auc_val <- auc(roc_obj)
ggroc(roc_obj, color = "steelblue", linewidth = 1.2) +
geom_abline(slope = 1, intercept = 1, linetype = "dashed", color = "grey50") +
annotate("text", x = 0.35, y = 0.15,
label = paste0("AUC = ", round(auc_val, 3)),
size = 5, color = "steelblue") +
labs(title = "ROC Curve - Logistic Regression (Lag2 only, Test 2009-2010)",
subtitle = "Dashed = random classifier",
x = "Specificity", y = "Sensitivity") +
theme_bw()prec2 <- conf_mat2["Up","Up"] / sum(conf_mat2["Up",])
rec2 <- conf_mat2["Up","Up"] / sum(conf_mat2[,"Up"])
data.frame(
Metric = c("Test Accuracy (2009-2010)","Precision (Up)","Recall (Up)","AUC"),
Value = round(c(acc2, prec2, rec2, as.numeric(auc_val)), 4)
) %>%
kable("html", caption = "Performance on Held-Out Test Set (2009-2010)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE) %>%
row_spec(1, background = "#d4edda")| Metric | Value |
|---|---|
| Test Accuracy (2009-2010) | 0.6250 |
| Precision (Up) | 0.6222 |
| Recall (Up) | 0.9180 |
| AUC | 0.4537 |
Comment: Using only Lag2 and evaluating
on held-out 2009-2010 data yields a test accuracy of 62.5% — an
improvement over the full model. The AUC of 0.454 confirms modest but
genuine discriminatory power. The simpler model generalises better,
consistent with Occam’s Razor: complexity should only
be added when it demonstrably reduces test error.
End of Midterm Homework — Application of Financial Software