# Load Packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(ggplot2)
library(corrplot)
## corrplot 0.94 loaded
# Load dataset
# Work on a copy of the built-in mtcars data frame. Columns are always
# referenced explicitly (Cars$col or via `data =`), so attach() is deliberately
# not used: it would put the columns on the search path and mask ggplot2::mpg.
Cars <- mtcars
## Data Exploration
# Preview the first six rows to check column names and value ranges.
print(head(Cars, n = 6L))
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Five-number summary plus mean for every column of the data frame.
print(summary(Cars))
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
# Pairwise Pearson correlations between all columns; kept in a variable
# because the correlation plot further down reuses the matrix.
mpg_corr_matrix <- cor(Cars, method = "pearson")
print(mpg_corr_matrix)
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594
## cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958
## disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799
## hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479
## drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406
## wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000
## qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159
## vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157
## am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953
## gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870
## carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059
## qsec vs am gear carb
## mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507
## cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686
## hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247
## drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980
## wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594
## qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714
## am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435
## gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284
## carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000
# Number of cars for each cylinder count (read from Cars, like every other
# exploration step in this script).
table(Cars$cyl)
##
## 4 6 8
## 11 7 14
# Distribution of the response variable.
summary(Cars$mpg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.40 15.43 19.20 20.09 22.80 33.90
# Distribution of horsepower.
summary(Cars$hp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 52.0 96.5 123.0 146.7 180.0 335.0
# Distribution of weight. The original script summarised hp a second time
# here; wt is the remaining predictor discussed in the text that follows.
summary(Cars$wt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.513 2.581 3.325 3.217 3.610 5.424
* Wt (weight) has the strongest negative correlation with
mpg, with a value of -0.8676594. This suggests that as the
weight of the car increases, the miles per gallon decrease.
* Cyl (number of cylinders) has the second strongest
negative correlation with mpg, with a value of -0.8521620.
This indicates that cars with more cylinders tend to have lower fuel
efficiency.
* Hp (horsepower) also negatively correlates with
mpg, with a value of -0.7761684. This means that higher
horsepower generally results in lower fuel efficiency.

No data is missing.
# Count the missing values in each column; all zeros means the data is complete.
print(vapply(Cars, function(column) sum(is.na(column)), numeric(1)))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 0 0 0 0 0 0 0 0 0 0 0
# One boxplot per column on a shared axis, to spot outliers and compare spread.
boxplot(Cars, main = "Boxplot detecting outliers", col = "orange")
# Upper-triangle correlation plot with variables ordered by hierarchical
# clustering. corrplot is already attached at the top of the script, so it is
# not loaded a second time here.
corrplot(
  mpg_corr_matrix,
  method = "circle", type = "upper", order = "hclust",
  tl.col = "black", tl.srt = 45
)
* disp and hp have a larger variance than the
other variables.
* For hp, there could be a sports car or a truck/16-wheeler that looks
abnormal compared to the other observations.
* wt and carb each have one or two outliers,
but they are not extreme.
* wt (weight): There is a strong negative correlation
between mpg and wt (about -0.87). This suggests that as a
car’s weight increases, its fuel efficiency decreases.

# Full model: regress mpg on every other column of mtcars.
# The earlier data(Cars) and attach(Cars) calls were dropped: Cars is an
# in-memory copy rather than a packaged dataset (data() only warned that it
# was not found), and lm() receives its data through `data =`, so nothing
# needs to be on the search path.
Cars_lm_mpg <- lm(mpg ~ ., data = mtcars)
summary(Cars_lm_mpg)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
* cyl (-0.11144): Each additional cylinder is associated with a decrease
in mpg of ~0.11, holding the other predictors constant. Not statistically
significant at the 0.05 significance level (p-value 0.9161).
* disp (0.01334): For every unit increase in
displacement, mpg increases by about 0.0133; not statistically
significant at the 0.05 significance level (p-value 0.4635).
* hp (-0.02148): Each additional unit of horsepower
decreases mpg by 0.0215, but again, this is not statistically
significant (p-value 0.3350).
* wt (-3.71530): This coefficient is significant at the 10% level
(p-value = 0.0633). It indicates that for every additional 1,000 pounds
in weight, the car’s fuel efficiency decreases by about 3.7 mpg. This
result aligns with expectations; heavier cars typically have lower fuel
efficiency.
* qsec (0.82104): Each additional second in 1/4 mile time (qsec)
increases mpg by 0.82, suggesting that cars with slower acceleration
tend to be more fuel-efficient. This relationship is not statistically
significant (p-value 0.2739).
* vs (0.31776): The engine shape has a very weak
positive association with mpg, but it is far from significant (p-value
0.8814).
* am (2.52023): Cars with manual transmission (am = 1) have 2.52
more mpg than cars with automatic transmission, but the p-value is
0.2340, so this is not statistically significant.
* gear (0.65541): The number of forward gears doesn’t show any
significant effect on mpg (p-value 0.6652).

In my opinion, the independent variables weight, hp, and cyl would make the best fit for a linear regression. In that reduced model (fitted below), the p-value of 0.140 is high for horsepower, but I think it is an important factor. The only variable significant at the 0.05 level is **weight**, with a p-value of 0.000199.
# Reduced model: mpg explained by weight, horsepower and cylinder count only.
mpg_lm_bestfit <- lm(formula = mpg ~ wt + hp + cyl, data = mtcars)
print(summary(mpg_lm_bestfit))
##
## Call:
## lm(formula = mpg ~ wt + hp + cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9290 -1.5598 -0.5311 1.1850 5.8986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.75179 1.78686 21.687 < 2e-16 ***
## wt -3.16697 0.74058 -4.276 0.000199 ***
## hp -0.01804 0.01188 -1.519 0.140015
## cyl -0.94162 0.55092 -1.709 0.098480 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.512 on 28 degrees of freedom
## Multiple R-squared: 0.8431, Adjusted R-squared: 0.8263
## F-statistic: 50.17 on 3 and 28 DF, p-value: 2.184e-11
# Draw the standard lm diagnostic plots for the full model in a 2 x 2 grid.
# par() returns the previous settings, which are restored afterwards so later
# plots are not squeezed into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(Cars_lm_mpg)
par(old_par)
* Linearity: The Residuals vs Fitted plot shows a slight curve,
indicating that the linearity assumption may not be perfectly met (the
relationship between predictors and mpg might not be fully linear).
* Normality of Residuals: The Q-Q plot mostly follows a straight line,
suggesting that the residuals are approximately normally distributed,
though there are a few outliers (e.g., Chrysler Imperial, Fiat 128,
Ford Pantera L).
* Homoscedasticity: The Scale-Location plot shows a
slightly increasing trend, which suggests that the residuals may not
have constant variance (some heteroscedasticity is present).
* Influential Points: The Residuals vs Leverage plot identifies
influential data points, particularly Chrysler Imperial and Ford Pantera
L, which may disproportionately affect the model.
# In-sample mean squared error of the full model: the average squared gap
# between the observed mpg and the model's prediction for the same rows.
predicted_mpg <- predict(Cars_lm_mpg)
residuals <- Cars[["mpg"]] - predicted_mpg
mse <- mean(residuals^2)
print(mse)
## [1] 4.609201
MSE = 4.609201. An MSE of 4.61 for the Cars dataset, with mpg (miles per gallon) as the dependent variable, indicates that the average squared difference between the actual mpg values and the values predicted by the model (on the same data used to fit it) is 4.61. This suggests that, while the model predicts mpg reasonably well, there are moderate differences between the actual and predicted fuel efficiency values. A lower MSE would indicate a better fit, meaning less prediction error. To express the error in mpg units, you could also compute the Root Mean Squared Error (RMSE), the square root of the MSE, which is approximately 2.15 mpg.
# Full model plus a weight-by-horsepower interaction. wt and hp are already
# listed as main effects, so `wt * hp` only contributes the interaction term.
full_model <- lm(
  formula = mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb + wt * hp,
  data = mtcars
)
print(summary(full_model))
##
## Call:
## lm(formula = mpg ~ cyl + disp + hp + drat + wt + qsec + vs +
## am + gear + carb + wt * hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6129 -1.4482 0.2571 1.1179 4.0907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.903972 16.390539 1.702 0.104165
## cyl 1.011371 0.941887 1.074 0.295710
## disp -0.002363 0.015716 -0.150 0.882013
## hp -0.140989 0.041789 -3.374 0.003018 **
## drat -0.803048 1.455063 -0.552 0.587132
## wt -9.613350 2.439829 -3.940 0.000809 ***
## qsec 0.744333 0.611042 1.218 0.237347
## vs 0.133431 1.759111 0.076 0.940291
## am -0.725300 1.999043 -0.363 0.720543
## gear 2.907613 1.434933 2.026 0.056279 .
## carb -0.512939 0.699359 -0.733 0.471800
## hp:wt 0.036219 0.011403 3.176 0.004746 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.214 on 20 degrees of freedom
## Multiple R-squared: 0.9129, Adjusted R-squared: 0.865
## F-statistic: 19.06 on 11 and 20 DF, p-value: 3.046e-08
# Interaction between weight and horsepower (wt * hp), with displacement and
# cylinder count kept as additional main effects.
Cars_lm_interaction <- lm(formula = mpg ~ wt * hp + disp + cyl, data = mtcars)
print(summary(Cars_lm_interaction))
##
## Call:
## lm(formula = mpg ~ wt * hp + disp + cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4093 -1.6584 -0.5678 1.4284 4.5726
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.569405 3.816026 12.990 7.09e-13 ***
## wt -7.643723 1.558423 -4.905 4.32e-05 ***
## hp -0.107661 0.031230 -3.447 0.00194 **
## disp 0.001079 0.010918 0.099 0.92204
## cyl -0.404110 0.650595 -0.621 0.53992
## wt:hp 0.025561 0.008608 2.969 0.00634 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.213 on 26 degrees of freedom
## Multiple R-squared: 0.887, Adjusted R-squared: 0.8652
## F-statistic: 40.8 on 5 and 26 DF, p-value: 1.684e-11
* The interaction term wt:hp is statistically
significant, with a p-value of 0.00634 (well below 0.05), meaning the
interaction between weight and horsepower has a significant effect on
predicting mpg.
* wt and hp both remain statistically
significant on their own (with p-values well below 0.05), indicating
that both weight and horsepower have strong individual effects on
mpg.
* Removing disp and
cyl will be a good idea since their p-values are large and
non-significant. This would leave me with a factorial ANOVA-style model with the
main effects being wt and hp, and the
interaction wt*hp.

# Winsorize hp: values below the 1st percentile or above the 99th percentile
# are replaced by the respective percentile, so no rows are dropped.
lower_bound_hp <- quantile(Cars[["hp"]], probs = 0.01)
upper_bound_hp <- quantile(Cars[["hp"]], probs = 0.99)
#winsorization to 'hp'
Cars[["hp_winsorized"]] <- pmin(
  pmax(Cars[["hp"]], unname(lower_bound_hp)),
  unname(upper_bound_hp)
)
# Baseline interaction model on the raw hp values, for comparison.
model_before <- lm(formula = mpg ~ hp * wt, data = Cars)
print(summary(model_before))
##
## Call:
## lm(formula = mpg ~ hp * wt, data = Cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0632 -1.6491 -0.7362 1.4211 4.5513
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.80842 3.60516 13.816 5.01e-14 ***
## hp -0.12010 0.02470 -4.863 4.04e-05 ***
## wt -8.21662 1.26971 -6.471 5.20e-07 ***
## hp:wt 0.02785 0.00742 3.753 0.000811 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.153 on 28 degrees of freedom
## Multiple R-squared: 0.8848, Adjusted R-squared: 0.8724
## F-statistic: 71.66 on 3 and 28 DF, p-value: 2.981e-13
# Same interaction model, fitted with the winsorized horsepower column.
model_after <- lm(formula = mpg ~ hp_winsorized * wt, data = Cars)
print(summary(model_after))
##
## Call:
## lm(formula = mpg ~ hp_winsorized * wt, data = Cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0915 -1.6182 -0.7639 1.3622 4.4830
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.930327 3.609565 13.833 4.86e-14 ***
## hp_winsorized -0.122091 0.024660 -4.951 3.17e-05 ***
## wt -8.191877 1.279261 -6.404 6.22e-07 ***
## hp_winsorized:wt 0.028008 0.007425 3.772 0.000771 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.134 on 28 degrees of freedom
## Multiple R-squared: 0.8868, Adjusted R-squared: 0.8747
## F-statistic: 73.11 on 3 and 28 DF, p-value: 2.328e-13
* The hp variable had noticeable
outliers in the dataset, as identified by the boxplot, suggesting the
presence of extreme values that could influence the regression
model.
* Winsorization was applied to hp,
capping its extreme values at the 1st and 99th percentiles to mitigate
the effect of outliers without removing data points.
* Model fit was essentially unchanged (adjusted R-squared 0.8724 before
vs. 0.8747 after winsorization) when predicting mpg.
* hp, wt, and the interaction term
hp:wt remained significant in both models,
with the hp coefficient becoming slightly more negative
after winsorization (-0.1201 to -0.1221), suggesting a slightly stronger negative
relationship between hp and mpg once extreme
values were reduced.