# Load the built-in mtcars data set into the workspace.
data(mtcars)
# NOTE: attach(mtcars) was removed on purpose. Every later statement reaches
# the columns through `data = mtcars` or `mtcars$...`, so the attached copy
# was never used; it only put a second, non-updating copy of each column on
# the search path and clashed with ggplot2's own `mpg` object.
# Open the spreadsheet-style viewer only when a person is at the console;
# View() has nothing to show in a batch or knitted run.
if (interactive()) {
  View(mtcars)
}
# Five-number summary plus the mean for every column.
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'mtcars':
##
## mpg
# Histogram of mpg: bins 2 units wide, light blue bars with a black outline.
ggplot(data = mtcars, mapping = aes(x = mpg)) +
  labs(
    title = "Distribution of Miles Per Gallon (mpg)",
    x = "mpg",
    y = "Count"
  ) +
  geom_histogram(binwidth = 2, colour = "black", fill = "lightblue")
# Scatter plot of mpg (y) against wt (x), green points on the minimal theme.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point(colour = "green") +
  labs(
    title = "Miles Per Gallon vs. Weight",
    x = "Weight (1000 lbs)",
    y = "mpg"
  ) +
  theme_minimal()
Heavier cars usually have lower mpg.
# Pairwise Pearson correlations between all columns of mtcars.
cor_matrix <- cor(x = mtcars, method = "pearson")
# Show only the correlations of each column with mpg.
cor_matrix[, "mpg"]
## mpg cyl disp hp drat wt qsec
## 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.6811719 -0.8676594 0.4186840
## vs am gear carb
## 0.6640389 0.5998324 0.4802848 -0.5509251
# Total number of missing cells: count the NAs per column, then add them up.
sum(vapply(mtcars, function(column) sum(is.na(column)), integer(1)))
## [1] 0
There are no missing values in the data set.
# One box per column; las = 2 turns the axis labels perpendicular to the axis
# and cex.axis = 0.6 shrinks them.
boxplot(x = mtcars, cex.axis = 0.6, las = 2)
There may be some outliers, for example in displacement (disp) and horsepower (hp).
# Baseline linear model: regress mpg on every other column of mtcars.
model <- lm(formula = mpg ~ ., data = mtcars)
# Coefficient table, residual quantiles and overall fit statistics.
summary(object = model)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
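Weight seems to have the biggest impact on reducing mpg: each additional 1000 lbs is associated with a loss of about 3.72 mpg, holding the other variables fixed. Transmission type (am) appears to increase mpg the most (a coefficient of about 2.52), although none of the coefficients is significant at the 5% level in this model.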
# In-sample mean squared error of `model`: the average squared residual.
residuals <- resid(model)
mse <- mean(residuals * residuals)
print(mse)
## [1] 4.609201
# Same predictors as the baseline model, plus a weight-by-horsepower
# interaction term (wt:hp).
model_interaction <- lm(formula = mpg ~ wt * hp + ., data = mtcars)
# Coefficient table, residual quantiles and overall fit statistics.
summary(object = model_interaction)
##
## Call:
## lm(formula = mpg ~ wt * hp + ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6129 -1.4482 0.2571 1.1179 4.0907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.903972 16.390539 1.702 0.104165
## wt -9.613350 2.439829 -3.940 0.000809 ***
## hp -0.140989 0.041789 -3.374 0.003018 **
## cyl 1.011371 0.941887 1.074 0.295710
## disp -0.002363 0.015716 -0.150 0.882013
## drat -0.803048 1.455063 -0.552 0.587132
## qsec 0.744333 0.611042 1.218 0.237347
## vs 0.133431 1.759111 0.076 0.940291
## am -0.725300 1.999043 -0.363 0.720543
## gear 2.907613 1.434933 2.026 0.056279 .
## carb -0.512939 0.699359 -0.733 0.471800
## wt:hp 0.036219 0.011403 3.176 0.004746 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.214 on 20 degrees of freedom
## Multiple R-squared: 0.9129, Adjusted R-squared: 0.865
## F-statistic: 19.06 on 11 and 20 DF, p-value: 3.046e-08
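The R-squared of the original model was 0.869; adding the interaction between weight and horsepower increased it to 0.9129 (adjusted R-squared rose from 0.8066 to 0.865). The wt:hp term is itself significant (p = 0.0047), which indicates that the effect of weight on mpg depends on horsepower and that both variables matter for mpg.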
# One box per column; las = 2 turns the axis labels perpendicular to the axis
# and cex.axis = 0.6 shrinks them.
boxplot(x = mtcars, cex.axis = 0.6, las = 2)
The disp and hp columns may contain outliers.
# Winsorize mpg at its 1st and 99th percentiles and refit the full model on
# the winsorized response.

# Calculate the 1st and 99th percentiles for 'mpg'
lower_bound_mpg <- quantile(mtcars$mpg, 0.01, na.rm = TRUE)
upper_bound_mpg <- quantile(mtcars$mpg, 0.99, na.rm = TRUE)

# Winsorize the 'mpg' variable: clamp every value into
# [lower_bound_mpg, upper_bound_mpg] and store it as a new column.
mtcars$mpg_wins <- pmin(pmax(mtcars$mpg, lower_bound_mpg), upper_bound_mpg)

# Fit model with winsorized 'mpg'. The raw mpg column must be dropped from
# the right-hand side: `mpg_wins ~ .` would use mpg to predict a near-copy of
# itself, which yields a meaningless, almost perfect fit.
model_wins <- lm(mpg_wins ~ . - mpg, data = mtcars)
summary(model_wins)
##
## Call:
## lm(formula = mpg_wins ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.296022 -0.022676 -0.006357 0.029335 0.137883
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.645e-01 5.919e-01 1.629 0.119
## mpg 9.862e-01 6.831e-03 144.366 <2e-16 ***
## cyl -2.938e-02 3.272e-02 -0.898 0.380
## disp 4.569e-05 5.664e-04 0.081 0.937
## hp -4.827e-04 6.971e-04 -0.692 0.497
## drat -1.372e-02 5.147e-02 -0.267 0.793
## wt -6.838e-03 6.450e-02 -0.106 0.917
## qsec -2.571e-02 2.356e-02 -1.092 0.288
## vs 1.193e-02 6.591e-02 0.181 0.858
## am -5.301e-02 6.664e-02 -0.795 0.436
## gear 2.721e-02 4.696e-02 0.579 0.569
## carb -4.574e-03 2.598e-02 -0.176 0.862
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08296 on 20 degrees of freedom
## Multiple R-squared: 0.9999, Adjusted R-squared: 0.9998
## F-statistic: 1.47e+04 on 11 and 20 DF, p-value: < 2.2e-16
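Winsorizing mpg changed the R-squared from 0.869 to 0.9999, an almost perfect fit that accounts for nearly all of the variability. This is an artifact rather than a real improvement: the formula mpg_wins ~ . includes the original mpg column as a predictor (coefficient 0.986, t = 144), so the model is essentially regressing mpg on itself. It accounts for the variation in this sample but will not work well on new data.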