1) Data Exploration

1.1 Load data, codebook, and high-level review

data(mtcars)   # built-in dataset
df <- mtcars
str(df)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
summary(df)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

Codebook (variables & units)

codebook <- data.frame(
  variable = c("mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb"),
  description = c(
    "Miles per gallon (numeric)",
    "Number of cylinders (4, 6, 8)",
    "Displacement (cu.in.)",
    "Gross horsepower",
    "Rear axle ratio",
    "Weight (1000 lbs)",
    "1/4 mile time (sec)",
    "Engine (0 = V-shaped, 1 = straight)",
    "Transmission (0 = automatic, 1 = manual)",
    "Number of forward gears",
    "Number of carburetors"
  ),
  stringsAsFactors = FALSE
)
codebook
##    variable                              description
## 1       mpg               Miles per gallon (numeric)
## 2       cyl            Number of cylinders (4, 6, 8)
## 3      disp                    Displacement (cu.in.)
## 4        hp                         Gross horsepower
## 5      drat                          Rear axle ratio
## 6        wt                        Weight (1000 lbs)
## 7      qsec                      1/4 mile time (sec)
## 8        vs      Engine (0 = V-shaped, 1 = straight)
## 9        am Transmission (0 = automatic, 1 = manual)
## 10     gear                  Number of forward gears
## 11     carb                    Number of carburetors

1.2 Visualizations & summary statistics

(Base R only, no extra packages.)

old_par <- par(mfrow = c(2,3))
hist(df$mpg,  main = "mpg",  xlab = "")
hist(df$wt,   main = "wt",   xlab = "")
hist(df$hp,   main = "hp",   xlab = "")
hist(df$disp, main = "disp", xlab = "")
hist(df$qsec, main = "qsec", xlab = "")
hist(df$drat, main = "drat", xlab = "")

par(old_par)

pairs(df[, c("mpg","wt","hp","disp","qsec","drat")],
      main = "Pairs Plot (selected variables)")

1.3 One interesting trend/pattern

From the plots, mpg decreases as wt (weight) and hp (horsepower) increase. Cars with manual transmissions (am = 1) generally have higher mpg than automatics.

1.4 Which variables are most strongly correlated with mpg? Why?

cors <- cor(df)
mpg_cors <- sort(cors[,"mpg"][-which(names(df)=="mpg")])
mpg_cors  # sorted (ascending); strongest negative are at the top
##         wt        cyl       disp         hp       carb       qsec       gear 
## -0.8676594 -0.8521620 -0.8475514 -0.7761684 -0.5509251  0.4186840  0.4802848 
##         am         vs       drat 
##  0.5998324  0.6640389  0.6811719
# Simple visual of absolute correlations with mpg:
barplot(sort(abs(cors[,"mpg"][-which(names(df)=="mpg")]), decreasing = TRUE),
        main = "Absolute correlations with mpg", las = 2, ylab = "|corr|")

Answer: By magnitude, wt, disp, and hp tend to be most strongly (negatively) correlated with mpg. Heavier cars and larger/stronger engines usually consume more fuel, reducing mileage.


2) Data Preprocessing

2.1 Missing data check (show evidence)

colSums(is.na(df))
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0
anyNA(df)
## [1] FALSE

The counts above show no missing values.

2.2 Inconsistent/invalid data check (show evidence)

checks <- list(
  mpg_positive = all(df$mpg > 0),
  cyl_valid = all(df$cyl %in% c(4,6,8)),
  disp_positive = all(df$disp > 0),
  hp_positive = all(df$hp > 0),
  drat_positive = all(df$drat > 0),
  wt_positive = all(df$wt > 0),
  qsec_positive = all(df$qsec > 0),
  vs_binary = all(df$vs %in% c(0,1)),
  am_binary = all(df$am %in% c(0,1)),
  gear_valid = all(df$gear %in% c(3,4,5)),
  carb_positive_int = all(df$carb == as.integer(df$carb) & df$carb > 0)
)
checks
## $mpg_positive
## [1] TRUE
## 
## $cyl_valid
## [1] TRUE
## 
## $disp_positive
## [1] TRUE
## 
## $hp_positive
## [1] TRUE
## 
## $drat_positive
## [1] TRUE
## 
## $wt_positive
## [1] TRUE
## 
## $qsec_positive
## [1] TRUE
## 
## $vs_binary
## [1] TRUE
## 
## $am_binary
## [1] TRUE
## 
## $gear_valid
## [1] TRUE
## 
## $carb_positive_int
## [1] TRUE

All checks return TRUE, so no obvious invalid values were found under these simple rules.


3) Linear Regression using lm

3.1 Create a linear regression model to predict mpg

mod_full <- lm(mpg ~ ., data = df)
summary(mod_full)
## 
## Call:
## lm(formula = mpg ~ ., data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

3.2 Interpret the coefficients (briefly)

  • Negative coefficients (e.g., wt, hp) mean higher values are associated with lower mpg, holding other variables constant.
  • Positive coefficients (e.g., possibly am, drat) mean higher values are associated with higher mpg, all else equal.
  • Use the Pr(>|t|) column above to see which predictors are statistically significant in this sample (n = 32).

3.3 Linear regression assumptions (diagnostics)

Linearity, independent errors, constant variance, and approximately normal residuals.

op <- par(mfrow = c(2,2))
plot(mod_full)  # standard diagnostic plots

par(op)

Diagnostics notes: The Residuals vs Fitted plot shows no strong curvature; variance looks roughly constant with mild spread changes.
The Q–Q plot is close to the line with slight tail deviations.
Scale–Location is fairly flat, suggesting near-constant variance.
Residuals vs Leverage flags a few moderately influential points but nothing extreme. Overall, assumptions appear mostly reasonable for this dataset.

3.4 Evaluate the model using MSE

mse_in_sample <- mean(residuals(mod_full)^2)
mse_in_sample
## [1] 4.609201

This is the in-sample Mean Squared Error (average squared residual).

3.5 Add interaction terms (find one significant, if possible)

We will try a few reasonable interactions and report the first with a significant coefficient.

try_pairs <- list(c("disp","hp"), c("wt","hp"), c("am","wt"))
results <- data.frame(interaction = character(), p_value = numeric(),
                      R2 = numeric(), Adj_R2 = numeric(), stringsAsFactors = FALSE)

for (p in try_pairs) {
  f <- as.formula(paste0("mpg ~ . + ", p[1], ":", p[2]))
  m <- lm(f, data = df)
  sm <- summary(m)
  term <- paste0(p[1], ":", p[2])
  if (term %in% rownames(sm$coefficients)) {
    results <- rbind(results,
      data.frame(interaction = term,
                 p_value = sm$coefficients[term, "Pr(>|t|)"],
                 R2 = sm$r.squared, Adj_R2 = sm$adj.r.squared))
  }
}
results
##   interaction    p_value        R2    Adj_R2
## 1     disp:hp 0.01403585 0.9038446 0.8509592

Fit the chosen interaction and compare performance (R² / Adj R²).

mod_int <- lm(mpg ~ . + disp:hp, data = df)
cbind(
  Full = c(R2 = summary(mod_full)$r.squared, AdjR2 = summary(mod_full)$adj.r.squared),
  With_disp_hp = c(R2 = summary(mod_int)$r.squared, AdjR2 = summary(mod_int)$adj.r.squared)
)
##            Full With_disp_hp
## R2    0.8690158    0.9038446
## AdjR2 0.8066423    0.8509592
summary(mod_int)$coefficients["disp:hp", , drop = FALSE]
##             Estimate   Std. Error  t value   Pr(>|t|)
## disp:hp 0.0003835253 0.0001424938 2.691523 0.01403585

Interpretation: The positive disp:hp coefficient suggests the (negative) effect of horsepower on mpg is slightly weaker at larger displacement (and vice-versa).

3.6 Outliers & Winsorization (if needed)

Identify potential outliers and apply winsorization to the single variable with the most outliers. Compare models before/after.

# (a) Influence via Cook's distance
cook <- cooks.distance(mod_full)
cutoff <- 4 / nrow(df)
which(cook > cutoff)  # indices of influential points (if any)
##          Merc 230 Chrysler Imperial      Lotus Europa    Ford Pantera L 
##                 9                17                28                29
# (b) Count univariate outliers per variable (boxplot rule)
num_vars <- df[, sapply(df, is.numeric)]
out_counts <- sapply(num_vars, function(x) length(boxplot.stats(x)$out))
out_counts
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    1    0    2    1    0    0    0    1
var_most_out <- names(which.max(out_counts))
var_most_out
## [1] "wt"
# (c) Simple winsorization helper (no extra packages)
winsorize_vec <- function(x, lower = 0.05, upper = 0.95) {
  qs <- quantile(x, probs = c(lower, upper), na.rm = TRUE, type = 7)
  x <- pmax(x, qs[1])
  x <- pmin(x, qs[2])
  x
}

df_w <- df
df_w[[var_most_out]] <- winsorize_vec(df_w[[var_most_out]], 0.05, 0.95)

mod_full_w <- lm(mpg ~ ., data = df_w)
c(R2_before = summary(mod_full)$r.squared,
  AdjR2_before = summary(mod_full)$adj.r.squared,
  R2_after = summary(mod_full_w)$r.squared,
  AdjR2_after = summary(mod_full_w)$adj.r.squared)
##    R2_before AdjR2_before     R2_after  AdjR2_after 
##    0.8690158    0.8066423    0.8649505    0.8006412
coef_before <- coef(mod_full)
coef_after  <- coef(mod_full_w)
head(sort(abs(coef_after - coef_before), decreasing = TRUE), 10)
## (Intercept)          vs        gear          wt        qsec          am 
## 1.513692278 0.199097609 0.101456718 0.101112016 0.101010096 0.091670938 
##        carb         cyl        drat        disp 
## 0.071374601 0.021165157 0.020301694 0.001311172

Winsorization note: After winsorizing wt, R² fell slightly (e.g., 0.869 → 0.865; Adj-R² 0.807 → 0.801) and the largest coefficient shifts were in the intercept, vs, and gear.

3.7 (No credit, just think)

Does a higher R² always improve predictability? Consider overfitting and the value of out-of-sample testing.