library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(devtools)
## Loading required package: usethis
# Load the built-in mtcars dataset, then inspect its structure and
# per-column summary statistics.
data(mtcars)
str(mtcars)
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
cor(mtcars)
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594
## cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958
## disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799
## hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479
## drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406
## wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000
## qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159
## vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157
## am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953
## gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870
## carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059
## qsec vs am gear carb
## mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507
## cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686
## hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247
## drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980
## wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594
## qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714
## am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435
## gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284
## carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000
The positive correlation between am and mpg (r ≈ 0.60) suggests that transmission type may influence fuel efficiency (manual cars, am = 1, tend to have higher mpg), possibly due to differences in engine control or driving behavior.
# Correlation of mpg with every column of mtcars (returned as a 1-row matrix).
cor(mtcars[["mpg"]], mtcars)
## mpg cyl disp hp drat wt qsec
## [1,] 1 -0.852162 -0.8475514 -0.7761684 0.6811719 -0.8676594 0.418684
## vs am gear carb
## [1,] 0.6640389 0.5998324 0.4802848 -0.5509251
Top Negative Correlations:
Heavier cars and cars with more cylinders, larger displacement, or more horsepower tend to consume more fuel, leading to lower mpg (wt: -0.87, cyl: -0.85, disp: -0.85, hp: -0.78).
sum(is.na(mtcars))
## [1] 0
colSums(is.na(mtcars))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 0 0 0 0 0 0 0 0 0 0 0
The dataset has no missing values.
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
table(mtcars$am) # Should be 0 or 1
##
## 0 1
## 19 13
table(mtcars$vs) # Should be 0 or 1
##
## 0 1
## 18 14
table(mtcars$gear) # Should be 3, 4, or 5
##
## 3 4 5
## 15 12 5
table(mtcars$carb) # Should be positive integers
##
## 1 2 3 4 6 8
## 7 10 3 10 1 1
# Checking for negative or zero values in mpg, wt, hp, etc.
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
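No inconsistent or invalid data detected: the coded variables (am, vs, gear, carb) take only their expected values, and all measured quantities are positive.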
# Fit the full linear model: mpg regressed on every other column of mtcars
# (`.` expands to all columns of `data` except the response).
model <- lm(mpg ~ ., data = mtcars)
# Print coefficients, standard errors, p-values, and overall fit statistics.
summary(model)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
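Each coefficient represents the expected change in mpg for a one-unit increase in the predictor, holding all other variables constant.
The p-values tell us which predictors have a statistically significant association with mpg; here none is significant at the 5% level (wt comes closest, p ≈ 0.063), which is common when predictors are strongly correlated with one another.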
Assumptions:
# Draw the lm diagnostic plots in a 2 x 2 grid, then restore the previous
# graphics settings so later plots are not squeezed into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)
Observations:
# In-sample mean squared error: average of the squared residuals of `model`.
squared_resid <- residuals(model)^2
mse <- mean(squared_resid)
mse
## [1] 4.609201
# Refit with a wt-by-hp interaction: `wt * hp` contributes wt, hp, and wt:hp,
# and `.` adds the remaining columns of mtcars as main effects.
model_interact <- lm(mpg ~ wt * hp + ., data = mtcars)
# Print the coefficient table and fit statistics for the interaction model.
summary(model_interact)
##
## Call:
## lm(formula = mpg ~ wt * hp + ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6129 -1.4482 0.2571 1.1179 4.0907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.903972 16.390539 1.702 0.104165
## wt -9.613350 2.439829 -3.940 0.000809 ***
## hp -0.140989 0.041789 -3.374 0.003018 **
## cyl 1.011371 0.941887 1.074 0.295710
## disp -0.002363 0.015716 -0.150 0.882013
## drat -0.803048 1.455063 -0.552 0.587132
## qsec 0.744333 0.611042 1.218 0.237347
## vs 0.133431 1.759111 0.076 0.940291
## am -0.725300 1.999043 -0.363 0.720543
## gear 2.907613 1.434933 2.026 0.056279 .
## carb -0.512939 0.699359 -0.733 0.471800
## wt:hp 0.036219 0.011403 3.176 0.004746 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.214 on 20 degrees of freedom
## Multiple R-squared: 0.9129, Adjusted R-squared: 0.865
## F-statistic: 19.06 on 11 and 20 DF, p-value: 3.046e-08
The wt:hp interaction term is statistically significant (p ≈ 0.005), which suggests that the effect of weight on mpg depends on horsepower (and vice versa).
summary(model)$r.squared
## [1] 0.8690158
summary(model_interact)$r.squared
## [1] 0.9129353
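The interaction term explained more of the variance in mpg (R² rose from 0.869 to 0.913, adjusted R² from 0.807 to 0.865), potentially revealing more nuanced relationships and improving the model's in-sample predictive power.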
boxplot(mtcars)
library(DescTools)
# Cap hp at its 5th and 95th percentiles to limit the influence of extreme
# values; the result is stored as a new column alongside the raw hp.
mtcars$hp_wins <- Winsorize(
  mtcars$hp,
  val = quantile(mtcars$hp, probs = c(0.05, 0.95), na.rm = TRUE)
)
# Refit the full model using the winsorized horsepower instead of the raw one.
# `.` expands to every column of mtcars except mpg, which already includes
# hp_wins; `- hp` drops the raw hp, and `+ hp_wins` is kept for explicitness.
model_wins <- lm(mpg ~ . - hp + hp_wins, data = mtcars)
# Print the coefficient table and fit statistics for the winsorized model.
summary(model_wins)
##
## Call:
## lm(formula = mpg ~ . - hp + hp_wins, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4440 -1.5389 -0.1908 1.1157 4.5556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.52194 18.89881 0.768 0.4508
## cyl -0.13067 1.03657 -0.126 0.9009
## disp 0.01304 0.01737 0.751 0.4609
## drat 1.07929 1.63034 0.662 0.5152
## wt -3.47885 1.84405 -1.887 0.0731 .
## qsec 0.69142 0.75049 0.921 0.3674
## vs 0.18045 2.05887 0.088 0.9310
## am 2.23794 2.05534 1.089 0.2886
## gear 0.56689 1.48310 0.382 0.7061
## carb -0.31407 0.76341 -0.411 0.6849
## hp_wins -0.02675 0.02533 -1.056 0.3031
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.642 on 21 degrees of freedom
## Multiple R-squared: 0.8699, Adjusted R-squared: 0.8079
## F-statistic: 14.04 on 10 and 21 DF, p-value: 3.559e-07
Not necessarily. A higher R² can mean better fit to training data, but it doesn’t guarantee better generalization. Always validate with test data or cross-validation.