# Load Packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(ggplot2)
library(corrplot)
## corrplot 0.94 loaded
# Load dataset
# Cars is a working copy of the built-in mtcars data frame. It is deliberately
# not attach()ed: every later step refers to columns as Cars$<col> or passes a
# `data =` argument, and attaching would mask ggplot2's `mpg` dataset.
Cars <- mtcars
## Data Exploration
# Preview the first rows to check the column names and the kind of values
# each column holds.
head(Cars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# Per-column summary statistics (min, quartiles, median, mean, max).
summary(Cars)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
# Pairwise Pearson correlations between all columns of Cars; the matrix is
# kept for the correlation plot drawn later in the script.
mpg_corr_matrix <- cor(x = Cars, method = "pearson")
print(mpg_corr_matrix)
##             mpg        cyl       disp         hp        drat         wt
## mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684  0.68117191 -0.8676594
## cyl  -0.8521620  1.0000000  0.9020329  0.8324475 -0.69993811  0.7824958
## disp -0.8475514  0.9020329  1.0000000  0.7909486 -0.71021393  0.8879799
## hp   -0.7761684  0.8324475  0.7909486  1.0000000 -0.44875912  0.6587479
## drat  0.6811719 -0.6999381 -0.7102139 -0.4487591  1.00000000 -0.7124406
## wt   -0.8676594  0.7824958  0.8879799  0.6587479 -0.71244065  1.0000000
## qsec  0.4186840 -0.5912421 -0.4336979 -0.7082234  0.09120476 -0.1747159
## vs    0.6640389 -0.8108118 -0.7104159 -0.7230967  0.44027846 -0.5549157
## am    0.5998324 -0.5226070 -0.5912270 -0.2432043  0.71271113 -0.6924953
## gear  0.4802848 -0.4926866 -0.5555692 -0.1257043  0.69961013 -0.5832870
## carb -0.5509251  0.5269883  0.3949769  0.7498125 -0.09078980  0.4276059
##             qsec         vs          am       gear        carb
## mpg   0.41868403  0.6640389  0.59983243  0.4802848 -0.55092507
## cyl  -0.59124207 -0.8108118 -0.52260705 -0.4926866  0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692  0.39497686
## hp   -0.70822339 -0.7230967 -0.24320426 -0.1257043  0.74981247
## drat  0.09120476  0.4402785  0.71271113  0.6996101 -0.09078980
## wt   -0.17471588 -0.5549157 -0.69249526 -0.5832870  0.42760594
## qsec  1.00000000  0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs    0.74453544  1.0000000  0.16834512  0.2060233 -0.56960714
## am   -0.22986086  0.1683451  1.00000000  0.7940588  0.05753435
## gear -0.21268223  0.2060233  0.79405876  1.0000000  0.27407284
## carb -0.65624923 -0.5696071  0.05753435  0.2740728  1.00000000
# Number of cars per cylinder count, followed by summaries of mpg (the
# response used in the regressions below) and horsepower. Each summary is
# printed once, and Cars is used throughout for consistency with the rest of
# the exploration.
table(Cars$cyl)
## 
##  4  6  8 
## 11  7 14
summary(Cars$mpg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.40   15.43   19.20   20.09   22.80   33.90
summary(Cars$hp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    52.0    96.5   123.0   146.7   180.0   335.0

Data Processing

No data is missing.

# Count missing values per column: is.na() yields a logical matrix and
# colSums() totals the TRUE entries of each column.
colSums(is.na(Cars))
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0
# One boxplot per column, all drawn on a shared axis.
# NOTE(review): the columns have very different ranges (disp reaches 472 while
# vs and am are 0/1), so the small-range variables will look compressed;
# consider plotting scale(Cars) if the goal is to compare outliers.
boxplot(Cars, main = "Boxplot detecting outliers", col = "orange")

# Plot the correlation matrix (mpg_corr_matrix) as circles in the upper
# triangle, with variables ordered by hierarchical clustering and black axis
# labels rotated 45 degrees. corrplot is already attached with the other
# packages at the top of the script, so it is not loaded again here.
corrplot(mpg_corr_matrix, method = "circle", type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45)

Insights from the Boxplot Matrix:

Insights from the Correlation Matrix:

Linear Regression

# Full additive model: mpg regressed on every other column of mtcars.
# The data frame is passed explicitly through `data =`, so no data() or
# attach() call is needed (data(Cars) only produced a "data set not found"
# warning, because Cars is a local object rather than a packaged dataset).
Cars_lm_mpg <- lm(mpg ~ ., data = mtcars)
summary(Cars_lm_mpg)
## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

Linear Regression Insights

Best fit Linear Regression

In my opinion, weight (wt), horsepower (hp), and number of cylinders (cyl) are the independent variables that give the best-fitting linear regression. The p-value for horsepower (0.140) is high, but I think it is an important factor. The only variable that is significant at the 5% level is **weight**, with a p-value of 0.000199.

# Reduced model: mpg explained by weight, horsepower and cylinder count only.
mpg_lm_bestfit <- lm(mpg ~ wt + hp + cyl, data = mtcars)
summary(mpg_lm_bestfit)
## 
## Call:
## lm(formula = mpg ~ wt + hp + cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9290 -1.5598 -0.5311  1.1850  5.8986 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 38.75179    1.78686  21.687  < 2e-16 ***
## wt          -3.16697    0.74058  -4.276 0.000199 ***
## hp          -0.01804    0.01188  -1.519 0.140015    
## cyl         -0.94162    0.55092  -1.709 0.098480 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.512 on 28 degrees of freedom
## Multiple R-squared:  0.8431, Adjusted R-squared:  0.8263 
## F-statistic: 50.17 on 3 and 28 DF,  p-value: 2.184e-11

Diagnostic Plots

# Draw the lm diagnostic plots for the full model (Cars_lm_mpg) in a 2 x 2
# grid. par() returns the previous settings; they are restored afterwards so
# that later plots are not forced into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(Cars_lm_mpg)
par(old_par)

* Linearity: The Residuals vs Fitted plot shows a slight curve, indicating that the linearity assumption may not be perfectly met (the relationship between the predictors and mpg might not be fully linear).
* Normality of Residuals: The Q-Q plot mostly follows a straight line, suggesting that the residuals are approximately normally distributed, though there are a few outliers (e.g., Chrysler Imperial, Fiat 128, Ford Pantera L).
* Homoscedasticity: The Scale-Location plot shows a slightly increasing trend, which suggests that the residuals may not have constant variance (some heteroscedasticity is present).
* Influential Points: The Residuals vs Leverage plot identifies influential data points, particularly Chrysler Imperial and Ford Pantera L, which may disproportionately affect the model.

Evaluate MSE

# In-sample mean squared error of the full model: the average squared gap
# between the observed mpg values and the model's fitted values.
# Note: the variable `residuals` shares its name with stats::residuals().
predicted_mpg <- predict(object = Cars_lm_mpg)
residuals <- Cars[["mpg"]] - predicted_mpg
mse <- mean(residuals^2)
print(mse)
## [1] 4.609201

MSE = 4.609201. An MSE of 4.61 for the Cars dataset, with mpg (miles per gallon) as the dependent variable, indicates that the average squared difference between the actual mpg values and the values predicted by the model is 4.61. This suggests that, while the model predicts mpg reasonably well, there are moderate differences between the actual and predicted fuel efficiency values. A lower MSE would indicate a better fit, meaning less prediction error. To express the error in units of mpg, you can also compute the Root Mean Squared Error (RMSE), the square root of the MSE, which is approximately 2.15 mpg.

Interaction

# All ten predictors plus a weight-by-horsepower interaction. `wt * hp`
# expands to wt + hp + wt:hp; the main effects are already listed, so the only
# term it adds is the interaction (shown as hp:wt in the coefficient table).
full_model <- lm(mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb + wt * hp, data = mtcars)
summary(full_model)
## 
## Call:
## lm(formula = mpg ~ cyl + disp + hp + drat + wt + qsec + vs + 
##     am + gear + carb + wt * hp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6129 -1.4482  0.2571  1.1179  4.0907 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.903972  16.390539   1.702 0.104165    
## cyl          1.011371   0.941887   1.074 0.295710    
## disp        -0.002363   0.015716  -0.150 0.882013    
## hp          -0.140989   0.041789  -3.374 0.003018 ** 
## drat        -0.803048   1.455063  -0.552 0.587132    
## wt          -9.613350   2.439829  -3.940 0.000809 ***
## qsec         0.744333   0.611042   1.218 0.237347    
## vs           0.133431   1.759111   0.076 0.940291    
## am          -0.725300   1.999043  -0.363 0.720543    
## gear         2.907613   1.434933   2.026 0.056279 .  
## carb        -0.512939   0.699359  -0.733 0.471800    
## hp:wt        0.036219   0.011403   3.176 0.004746 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.214 on 20 degrees of freedom
## Multiple R-squared:  0.9129, Adjusted R-squared:  0.865 
## F-statistic: 19.06 on 11 and 20 DF,  p-value: 3.046e-08
# Smaller interaction model: weight, horsepower and their interaction
# (wt * hp), with displacement and cylinder count as extra main effects.
Cars_lm_interaction <- lm(mpg ~ wt * hp + disp + cyl, data = mtcars)
summary(Cars_lm_interaction)
## 
## Call:
## lm(formula = mpg ~ wt * hp + disp + cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4093 -1.6584 -0.5678  1.4284  4.5726 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 49.569405   3.816026  12.990 7.09e-13 ***
## wt          -7.643723   1.558423  -4.905 4.32e-05 ***
## hp          -0.107661   0.031230  -3.447  0.00194 ** 
## disp         0.001079   0.010918   0.099  0.92204    
## cyl         -0.404110   0.650595  -0.621  0.53992    
## wt:hp        0.025561   0.008608   2.969  0.00634 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.213 on 26 degrees of freedom
## Multiple R-squared:  0.887,  Adjusted R-squared:  0.8652 
## F-statistic:  40.8 on 5 and 26 DF,  p-value: 1.684e-11

Winsorization

# Winsorize horsepower at its 1st and 99th percentiles: values below the lower
# bound are raised to it, values above the upper bound are lowered to it, and
# everything in between is left unchanged. The result is stored in a new
# column so the raw hp values remain available.
lower_bound_hp <- quantile(Cars$hp, probs = 0.01)
upper_bound_hp <- quantile(Cars$hp, probs = 0.99)
Cars$hp_winsorized <- pmin(pmax(Cars$hp, lower_bound_hp), upper_bound_hp)
# Baseline for the winsorization comparison: horsepower-by-weight interaction
# model fitted on the raw hp column.
model_before <- lm(mpg ~ hp * wt, data = Cars)
summary(model_before)
## 
## Call:
## lm(formula = mpg ~ hp * wt, data = Cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0632 -1.6491 -0.7362  1.4211  4.5513 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 49.80842    3.60516  13.816 5.01e-14 ***
## hp          -0.12010    0.02470  -4.863 4.04e-05 ***
## wt          -8.21662    1.26971  -6.471 5.20e-07 ***
## hp:wt        0.02785    0.00742   3.753 0.000811 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.153 on 28 degrees of freedom
## Multiple R-squared:  0.8848, Adjusted R-squared:  0.8724 
## F-statistic: 71.66 on 3 and 28 DF,  p-value: 2.981e-13
# Same specification as model_before, with the winsorized horsepower column
# (hp_winsorized) in place of hp.
model_after <- lm(mpg ~ hp_winsorized * wt, data = Cars)
summary(model_after)
## 
## Call:
## lm(formula = mpg ~ hp_winsorized * wt, data = Cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0915 -1.6182 -0.7639  1.3622  4.4830 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      49.930327   3.609565  13.833 4.86e-14 ***
## hp_winsorized    -0.122091   0.024660  -4.951 3.17e-05 ***
## wt               -8.191877   1.279261  -6.404 6.22e-07 ***
## hp_winsorized:wt  0.028008   0.007425   3.772 0.000771 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.134 on 28 degrees of freedom
## Multiple R-squared:  0.8868, Adjusted R-squared:  0.8747 
## F-statistic: 73.11 on 3 and 28 DF,  p-value: 2.328e-13