library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(devtools)
## Loading required package: usethis

Question 1

Summary and Correlation

# Load the built-in mtcars dataset and inspect its structure.
data(mtcars)
str(mtcars)

# Per-column summary (min, quartiles, median, mean, max) of mtcars.
summary(mtcars)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
# Pairwise correlation matrix across all columns of mtcars
# (cor() computes Pearson correlations by default).
cor(mtcars)
##             mpg        cyl       disp         hp        drat         wt
## mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684  0.68117191 -0.8676594
## cyl  -0.8521620  1.0000000  0.9020329  0.8324475 -0.69993811  0.7824958
## disp -0.8475514  0.9020329  1.0000000  0.7909486 -0.71021393  0.8879799
## hp   -0.7761684  0.8324475  0.7909486  1.0000000 -0.44875912  0.6587479
## drat  0.6811719 -0.6999381 -0.7102139 -0.4487591  1.00000000 -0.7124406
## wt   -0.8676594  0.7824958  0.8879799  0.6587479 -0.71244065  1.0000000
## qsec  0.4186840 -0.5912421 -0.4336979 -0.7082234  0.09120476 -0.1747159
## vs    0.6640389 -0.8108118 -0.7104159 -0.7230967  0.44027846 -0.5549157
## am    0.5998324 -0.5226070 -0.5912270 -0.2432043  0.71271113 -0.6924953
## gear  0.4802848 -0.4926866 -0.5555692 -0.1257043  0.69961013 -0.5832870
## carb -0.5509251  0.5269883  0.3949769  0.7498125 -0.09078980  0.4276059
##             qsec         vs          am       gear        carb
## mpg   0.41868403  0.6640389  0.59983243  0.4802848 -0.55092507
## cyl  -0.59124207 -0.8108118 -0.52260705 -0.4926866  0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692  0.39497686
## hp   -0.70822339 -0.7230967 -0.24320426 -0.1257043  0.74981247
## drat  0.09120476  0.4402785  0.71271113  0.6996101 -0.09078980
## wt   -0.17471588 -0.5549157 -0.69249526 -0.5832870  0.42760594
## qsec  1.00000000  0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs    0.74453544  1.0000000  0.16834512  0.2060233 -0.56960714
## am   -0.22986086  0.1683451  1.00000000  0.7940588  0.05753435
## gear -0.21268223  0.2060233  0.79405876  1.0000000  0.27407284
## carb -0.65624923 -0.5696071  0.05753435  0.2740728  1.00000000

Boxplot

The boxplot of mpg by transmission type (am) suggests that transmission type may influence fuel efficiency—possibly due to differences in engine control or driving behavior.

Correlation between mpg and every variable in mtcars

# Correlate mpg against every column of mtcars; the result is a one-row matrix.
cor(x = mtcars[["mpg"]], y = mtcars)
##      mpg       cyl       disp         hp      drat         wt     qsec
## [1,]   1 -0.852162 -0.8475514 -0.7761684 0.6811719 -0.8676594 0.418684
##             vs        am      gear       carb
## [1,] 0.6640389 0.5998324 0.4802848 -0.5509251

Top Negative Correlations:

  • weight (wt): -0.87
  • horsepower (hp): -0.78
  • cylinders (cyl): -0.85
  • displacement (disp): -0.85

Heavier cars and cars with more cylinders, larger displacement, or more horsepower tend to consume more fuel, leading to lower mpg.

Question 2

# Flag every missing cell once, then count missing values overall and by column.
na_mask <- is.na(mtcars)
sum(na_mask)
## [1] 0
colSums(na_mask)
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0

The dataset has no missing values.

# Show each column's type and first few values to confirm all are numeric.
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
# Frequency tables for the coded columns; an unexpected level would show up here.
table(mtcars$am)    # am should only take the values 0 or 1
## 
##  0  1 
## 19 13
table(mtcars$vs)    # vs should only take the values 0 or 1
## 
##  0  1 
## 18 14
table(mtcars$gear)  # gear should only be 3, 4, or 5
## 
##  3  4  5 
## 15 12  5
table(mtcars$carb)  # carb should be positive integers
## 
##  1  2  3  4  6  8 
##  7 10  3 10  1  1
# Check the Min. of each column for negative or zero values in mpg, wt, hp, etc.
summary(mtcars)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

No inconsistent or invalid data detected

Question 3

# Fit a linear model of mpg on all other columns of mtcars (`.` expands to
# every column except the response) and print the coefficient table.
model <- lm(mpg ~ ., data = mtcars)
summary(model)
## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

Each coefficient represents the expected change in mpg for a one-unit increase in the predictor, holding all other variables constant

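The p-values tell us which predictors have a statistically significant association with mpg once the other variables are held constant. In this full model none is significant at the 5% level; wt comes closest (p ≈ 0.063).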

Assumptions and Diagnostics

Assumptions:

  • Linearity
  • Independence of Errors
  • Normality of Residuals
  • Homoscedasticity
# Draw the lm diagnostic plots in a 2 x 2 grid, then restore the previous
# graphics settings so that later plots are not squeezed into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

Observations:

  • Residuals vs Fitted: The curved trend line indicates that linearity is violated
  • Normal Q-Q: Deviations from the reference line suggest non-normal residuals
  • Scale-Location: The spread of the residuals looks roughly constant, consistent with homoscedasticity
  • Residuals vs Leverage: Highlights potentially influential points

MSE

# In-sample mean squared error: the average squared residual over the rows
# used to fit the model (divides by n, not by the residual degrees of freedom).
# residuals() is the supported accessor for a fitted model's residuals.
mse <- mean(residuals(model)^2)
mse
## [1] 4.609201

Interaction Terms

# Refit with a wt:hp interaction added to all main effects; `wt * hp` expands
# to wt + hp + wt:hp, and `.` supplies the remaining columns.
model_interact <- lm(mpg ~ wt * hp + ., data = mtcars)
summary(model_interact)
## 
## Call:
## lm(formula = mpg ~ wt * hp + ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6129 -1.4482  0.2571  1.1179  4.0907 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.903972  16.390539   1.702 0.104165    
## wt          -9.613350   2.439829  -3.940 0.000809 ***
## hp          -0.140989   0.041789  -3.374 0.003018 ** 
## cyl          1.011371   0.941887   1.074 0.295710    
## disp        -0.002363   0.015716  -0.150 0.882013    
## drat        -0.803048   1.455063  -0.552 0.587132    
## qsec         0.744333   0.611042   1.218 0.237347    
## vs           0.133431   1.759111   0.076 0.940291    
## am          -0.725300   1.999043  -0.363 0.720543    
## gear         2.907613   1.434933   2.026 0.056279 .  
## carb        -0.512939   0.699359  -0.733 0.471800    
## wt:hp        0.036219   0.011403   3.176 0.004746 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.214 on 20 degrees of freedom
## Multiple R-squared:  0.9129, Adjusted R-squared:  0.865 
## F-statistic: 19.06 on 11 and 20 DF,  p-value: 3.046e-08

The wt:hp interaction term is significant (p ≈ 0.005), which suggests that the effect of weight on mpg depends on horsepower.

# Compare the share of variance in mpg explained by each fit.
summary(model)[["r.squared"]]
## [1] 0.8690158
summary(model_interact)[["r.squared"]]
## [1] 0.9129353

Adding the interaction term explained more of the variance in mpg (R² rose from 0.869 to 0.913, and adjusted R² from 0.807 to 0.865), potentially revealing a more nuanced relationship, which may improve predictive power.

# Side-by-side boxplots of every mtcars column to eyeball outliers.
# NOTE(review): the columns are on very different scales (e.g. disp vs. am),
# so small-range variables will look flat; per-variable plots may read better.
boxplot(mtcars)

library(DescTools)

# Cap hp at its 5th and 95th percentiles to limit the pull of extreme values.
# Uses TRUE rather than T, because T is an ordinary variable that can be
# reassigned.
mtcars$hp_wins <- Winsorize(
  mtcars$hp,
  val = quantile(mtcars$hp, probs = c(0.05, 0.95), na.rm = TRUE)
)

# Refit using the winsorized horsepower in place of raw hp. Because hp_wins is
# now a column of mtcars, `.` already includes it; `- hp` drops the raw column
# and the explicit `+ hp_wins` is redundant but harmless.
model_wins <- lm(mpg ~ . - hp + hp_wins, data = mtcars)

summary(model_wins)
## 
## Call:
## lm(formula = mpg ~ . - hp + hp_wins, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4440 -1.5389 -0.1908  1.1157  4.5556 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 14.52194   18.89881   0.768   0.4508  
## cyl         -0.13067    1.03657  -0.126   0.9009  
## disp         0.01304    0.01737   0.751   0.4609  
## drat         1.07929    1.63034   0.662   0.5152  
## wt          -3.47885    1.84405  -1.887   0.0731 .
## qsec         0.69142    0.75049   0.921   0.3674  
## vs           0.18045    2.05887   0.088   0.9310  
## am           2.23794    2.05534   1.089   0.2886  
## gear         0.56689    1.48310   0.382   0.7061  
## carb        -0.31407    0.76341  -0.411   0.6849  
## hp_wins     -0.02675    0.02533  -1.056   0.3031  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.642 on 21 degrees of freedom
## Multiple R-squared:  0.8699, Adjusted R-squared:  0.8079 
## F-statistic: 14.04 on 10 and 21 DF,  p-value: 3.559e-07

Does a higher R^2 mean the model has improved?

Not necessarily. A higher R² can mean better fit to training data, but it doesn’t guarantee better generalization. Always validate with test data or cross-validation.