data(mtcars) # built-in dataset
df <- mtcars
str(df)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
summary(df)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
Codebook (variables & units)
codebook <- data.frame(
  variable = c("mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"),
  description = c(
    "Miles per gallon (numeric)",
    "Number of cylinders (4, 6, 8)",
    "Displacement (cu.in.)",
    "Gross horsepower",
    "Rear axle ratio",
    "Weight (1000 lbs)",
    "1/4 mile time (sec)",
    "Engine (0 = V-shaped, 1 = straight)",
    "Transmission (0 = automatic, 1 = manual)",
    "Number of forward gears",
    "Number of carburetors"
  ),
  stringsAsFactors = FALSE
)
codebook
## variable description
## 1 mpg Miles per gallon (numeric)
## 2 cyl Number of cylinders (4, 6, 8)
## 3 disp Displacement (cu.in.)
## 4 hp Gross horsepower
## 5 drat Rear axle ratio
## 6 wt Weight (1000 lbs)
## 7 qsec 1/4 mile time (sec)
## 8 vs Engine (0 = V-shaped, 1 = straight)
## 9 am Transmission (0 = automatic, 1 = manual)
## 10 gear Number of forward gears
## 11 carb Number of carburetors
(Base R only, no extra packages.)
old_par <- par(mfrow = c(2,3))
hist(df$mpg, main = "mpg", xlab = "")
hist(df$wt, main = "wt", xlab = "")
hist(df$hp, main = "hp", xlab = "")
hist(df$disp, main = "disp", xlab = "")
hist(df$qsec, main = "qsec", xlab = "")
hist(df$drat, main = "drat", xlab = "")
par(old_par)
pairs(df[, c("mpg","wt","hp","disp","qsec","drat")],
main = "Pairs Plot (selected variables)")
From the plots, mpg decreases as
wt (weight) and
hp (horsepower) increase. Cars with manual
transmissions (am = 1) generally have higher
mpg than automatics.
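As a quick numeric check of the transmission claim, a boxplot and group means can be added (a minimal base-R sketch, not part of the original analysis):
boxplot(mpg ~ am, data = df,
        names = c("automatic (0)", "manual (1)"),
        ylab = "mpg", main = "mpg by transmission")
aggregate(mpg ~ am, data = df, FUN = mean)  # mean mpg per transmission group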
colSums(is.na(df))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 0 0 0 0 0 0 0 0 0 0 0
anyNA(df)
## [1] FALSE
The counts above show no missing values.
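If missing values had turned up, a base-R way to locate and drop the affected rows is shown below (a sketch only; with mtcars it returns nothing and leaves the data unchanged):
which(!complete.cases(df))    # row indices containing at least one NA (none here)
df_complete <- na.omit(df)    # drops incomplete rows; identical to df for mtcars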
checks <- list(
mpg_positive = all(df$mpg > 0),
cyl_valid = all(df$cyl %in% c(4,6,8)),
disp_positive = all(df$disp > 0),
hp_positive = all(df$hp > 0),
drat_positive = all(df$drat > 0),
wt_positive = all(df$wt > 0),
qsec_positive = all(df$qsec > 0),
vs_binary = all(df$vs %in% c(0,1)),
am_binary = all(df$am %in% c(0,1)),
gear_valid = all(df$gear %in% c(3,4,5)),
carb_positive_int = all(df$carb == as.integer(df$carb) & df$carb > 0)
)
checks
## $mpg_positive
## [1] TRUE
##
## $cyl_valid
## [1] TRUE
##
## $disp_positive
## [1] TRUE
##
## $hp_positive
## [1] TRUE
##
## $drat_positive
## [1] TRUE
##
## $wt_positive
## [1] TRUE
##
## $qsec_positive
## [1] TRUE
##
## $vs_binary
## [1] TRUE
##
## $am_binary
## [1] TRUE
##
## $gear_valid
## [1] TRUE
##
## $carb_positive_int
## [1] TRUE
All checks return
TRUE, so no obvious invalid values were found under these simple rules.
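The same rules can also be turned into hard assertions so the script stops with an error if future data violates them (a one-line sketch using base R):
stopifnot(all(unlist(checks)))  # errors if any validity check is FALSE; silent when all pass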
mod_full <- lm(mpg ~ ., data = df)
summary(mod_full)
##
## Call:
## lm(formula = mpg ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
Interpretation: negative coefficients (e.g., wt, hp) mean higher values are associated with lower mpg, holding other variables constant; positive coefficients (e.g., am, drat) mean higher values are associated with higher mpg, all else equal. Check the Pr(>|t|) column above to see which predictors are statistically significant in this sample (n = 32).
The model's assumptions are linearity, independent errors, constant variance, and approximately normal residuals; the standard diagnostic plots below check these.
op <- par(mfrow = c(2,2))
plot(mod_full) # standard diagnostic plots
par(op)
Diagnostics notes: The Residuals vs Fitted plot
shows no strong curvature; variance looks roughly constant with mild
spread changes.
The Q–Q plot is close to the line with slight tail deviations.
Scale–Location is fairly flat, suggesting near-constant variance.
Residuals vs Leverage flags a few moderately influential points but
nothing extreme. Overall, assumptions appear mostly
reasonable for this dataset.
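If a formal check is wanted alongside the plots, a Shapiro-Wilk test on the residuals is one option (a sketch; with n = 32 it has limited power, so the Q-Q plot remains the primary check):
shapiro.test(residuals(mod_full))  # H0: residuals are normally distributed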
mse_in_sample <- mean(residuals(mod_full)^2)
mse_in_sample
## [1] 4.609201
This is the in-sample Mean Squared Error (average squared residual).
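Note that the residual standard error printed by summary() (2.65) divides the residual sum of squares by the residual degrees of freedom (21) rather than by n = 32, so it differs from sqrt(MSE); a quick check (sketch):
sqrt(mse_in_sample)                                        # RMSE with divisor n = 32 (about 2.15)
sqrt(sum(residuals(mod_full)^2) / df.residual(mod_full))   # about 2.65, equals summary(mod_full)$sigma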
We will try a few reasonable interactions and report the first with a significant coefficient.
try_pairs <- list(c("disp","hp"), c("wt","hp"), c("am","wt"))
results <- data.frame(interaction = character(), p_value = numeric(),
                      R2 = numeric(), Adj_R2 = numeric(), stringsAsFactors = FALSE)
for (p in try_pairs) {
  f <- as.formula(paste0("mpg ~ . + ", p[1], ":", p[2]))
  m <- lm(f, data = df)
  sm <- summary(m)
  term <- paste0(p[1], ":", p[2])
  # Record (and stop at) the first interaction whose coefficient is significant at 5%
  if (term %in% rownames(sm$coefficients) &&
      sm$coefficients[term, "Pr(>|t|)"] < 0.05) {
    results <- rbind(results,
                     data.frame(interaction = term,
                                p_value = sm$coefficients[term, "Pr(>|t|)"],
                                R2 = sm$r.squared, Adj_R2 = sm$adj.r.squared,
                                stringsAsFactors = FALSE))
    break
  }
}
results
## interaction p_value R2 Adj_R2
## 1 disp:hp 0.01403585 0.9038446 0.8509592
Fit the chosen interaction and compare performance (R² / Adj R²).
mod_int <- lm(mpg ~ . + disp:hp, data = df)
cbind(
  Full = c(R2 = summary(mod_full)$r.squared, AdjR2 = summary(mod_full)$adj.r.squared),
  With_disp_hp = c(R2 = summary(mod_int)$r.squared, AdjR2 = summary(mod_int)$adj.r.squared)
)
## Full With_disp_hp
## R2 0.8690158 0.9038446
## AdjR2 0.8066423 0.8509592
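Because mod_int nests mod_full, the gain from the added term can also be assessed with a partial F-test (a one-line sketch using base R's anova); for a single added term its p-value matches the coefficient t-test reported next:
anova(mod_full, mod_int)  # partial F-test for adding disp:hp to the full model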
summary(mod_int)$coefficients["disp:hp", , drop = FALSE]
## Estimate Std. Error t value Pr(>|t|)
## disp:hp 0.0003835253 0.0001424938 2.691523 0.01403585
Interpretation: The positive
disp:hp coefficient suggests the (negative) effect of
horsepower on mpg is slightly weaker at larger
displacement (and vice-versa).
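One way to read the interaction is via the implied slope of mpg with respect to hp, which equals the hp coefficient plus the disp:hp coefficient times disp (a sketch; the quartiles of disp are just illustrative evaluation points):
b <- coef(mod_int)
disp_q <- quantile(df$disp, probs = c(0.25, 0.50, 0.75))
# Implied change in mpg per unit of hp at each displacement value, other predictors held fixed
b["hp"] + b["disp:hp"] * disp_q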
Identify potential outliers and apply winsorization to the single variable with the most outliers. Compare models before/after.
# (a) Influence via Cook's distance
cook <- cooks.distance(mod_full)
cutoff <- 4 / nrow(df)
which(cook > cutoff) # indices of influential points (if any)
## Merc 230 Chrysler Imperial Lotus Europa Ford Pantera L
## 9 17 28 29
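An optional visual check of the flagged points (sketch): plot Cook's distance per observation against the rule-of-thumb cutoff.
plot(cook, type = "h", ylab = "Cook's distance", main = "Cook's distance (full model)")
abline(h = cutoff, lty = 2)  # 4/n cutoff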
# (b) Count univariate outliers per variable (boxplot rule)
num_vars <- df[, sapply(df, is.numeric)]
out_counts <- sapply(num_vars, function(x) length(boxplot.stats(x)$out))
out_counts
## mpg cyl disp hp drat wt qsec vs am gear carb
## 0 0 0 1 0 2 1 0 0 0 1
var_most_out <- names(which.max(out_counts))
var_most_out
## [1] "wt"
# (c) Simple winsorization helper (no extra packages)
winsorize_vec <- function(x, lower = 0.05, upper = 0.95) {
  qs <- quantile(x, probs = c(lower, upper), na.rm = TRUE, type = 7)
  x <- pmax(x, qs[1])
  x <- pmin(x, qs[2])
  x
}
df_w <- df
df_w[[var_most_out]] <- winsorize_vec(df_w[[var_most_out]], 0.05, 0.95)
mod_full_w <- lm(mpg ~ ., data = df_w)
c(R2_before = summary(mod_full)$r.squared,
  AdjR2_before = summary(mod_full)$adj.r.squared,
  R2_after = summary(mod_full_w)$r.squared,
  AdjR2_after = summary(mod_full_w)$adj.r.squared)
## R2_before AdjR2_before R2_after AdjR2_after
## 0.8690158 0.8066423 0.8649505 0.8006412
coef_before <- coef(mod_full)
coef_after <- coef(mod_full_w)
head(sort(abs(coef_after - coef_before), decreasing = TRUE), 10)
## (Intercept) vs gear wt qsec am
## 1.513692278 0.199097609 0.101456718 0.101112016 0.101010096 0.091670938
## carb cyl drat disp
## 0.071374601 0.021165157 0.020301694 0.001311172
Winsorization note: After winsorizing
wt, R² fell slightly (e.g., 0.869 → 0.865;
Adj-R² 0.807 → 0.801) and the largest coefficient shifts were in the
intercept, vs, and gear.
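To make the effect of the winsorization concrete, compare the tails of wt before and after capping (sketch; only values outside the 5th-95th percentile range change):
range(df$wt)             # original range (1.513 to 5.424)
range(df_w$wt)           # after capping at the 5% / 95% quantiles
sum(df_w$wt != df$wt)    # number of observations that were capped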
Does a higher R² always improve predictability? Not necessarily: in-sample R² never decreases when predictors are added, so a larger R² can simply reflect overfitting to this particular sample (n = 32). Out-of-sample testing, such as a held-out set or cross-validation, is a better gauge of predictive performance.
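One way to probe this with base R alone is leave-one-out cross-validation; the helper below (loocv_mse is our own name, not part of the analysis above) refits each model 32 times and averages the squared prediction errors on the held-out rows:
loocv_mse <- function(formula, data) {
  errs <- sapply(seq_len(nrow(data)), function(i) {
    fit <- lm(formula, data = data[-i, ])                       # fit without row i
    (data$mpg[i] - predict(fit, newdata = data[i, , drop = FALSE]))^2  # squared error on row i
  })
  mean(errs)
}
c(Full = loocv_mse(mpg ~ ., df),
  With_disp_hp = loocv_mse(mpg ~ . + disp:hp, df))
If the interaction model's LOOCV error is not clearly lower, its higher in-sample R² adds little real predictive value.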