# Load the built-in mtcars data set (32 cars, 11 numeric columns).
# The data frame is intentionally NOT attach()ed: every later call reaches its
# columns through `data = mtcars` or `mtcars$...`, and attaching only puts
# copies of the columns on the search path, where other packages can mask
# them (ggplot2 ships its own `mpg` data set).
data(mtcars)

# Preview the first six rows.
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# Per-column quartiles, extremes, and mean for the whole data set.
summary(object = mtcars)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.94 loaded
# Pairwise Pearson correlations between every pair of columns in mtcars.
corr_matrix <- cor(x = mtcars, method = "pearson")
print(x = corr_matrix)
##             mpg        cyl       disp         hp        drat         wt
## mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684  0.68117191 -0.8676594
## cyl  -0.8521620  1.0000000  0.9020329  0.8324475 -0.69993811  0.7824958
## disp -0.8475514  0.9020329  1.0000000  0.7909486 -0.71021393  0.8879799
## hp   -0.7761684  0.8324475  0.7909486  1.0000000 -0.44875912  0.6587479
## drat  0.6811719 -0.6999381 -0.7102139 -0.4487591  1.00000000 -0.7124406
## wt   -0.8676594  0.7824958  0.8879799  0.6587479 -0.71244065  1.0000000
## qsec  0.4186840 -0.5912421 -0.4336979 -0.7082234  0.09120476 -0.1747159
## vs    0.6640389 -0.8108118 -0.7104159 -0.7230967  0.44027846 -0.5549157
## am    0.5998324 -0.5226070 -0.5912270 -0.2432043  0.71271113 -0.6924953
## gear  0.4802848 -0.4926866 -0.5555692 -0.1257043  0.69961013 -0.5832870
## carb -0.5509251  0.5269883  0.3949769  0.7498125 -0.09078980  0.4276059
##             qsec         vs          am       gear        carb
## mpg   0.41868403  0.6640389  0.59983243  0.4802848 -0.55092507
## cyl  -0.59124207 -0.8108118 -0.52260705 -0.4926866  0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692  0.39497686
## hp   -0.70822339 -0.7230967 -0.24320426 -0.1257043  0.74981247
## drat  0.09120476  0.4402785  0.71271113  0.6996101 -0.09078980
## wt   -0.17471588 -0.5549157 -0.69249526 -0.5832870  0.42760594
## qsec  1.00000000  0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs    0.74453544  1.0000000  0.16834512  0.2060233 -0.56960714
## am   -0.22986086  0.1683451  1.00000000  0.7940588  0.05753435
## gear -0.21268223  0.2060233  0.79405876  1.0000000  0.27407284
## carb -0.65624923 -0.5696071  0.05753435  0.2740728  1.00000000
# Plot the upper triangle of the correlation matrix as circles. Variables are
# reordered by hierarchical clustering; text labels are black and rotated 45
# degrees.
corrplot(
  corr_matrix,
  method = "circle",
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)

The variables most strongly associated with mpg are wt, cyl, and disp. Now let’s look at these variables graphically.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'mtcars':
## 
##     mpg
# Scatter plot of fuel economy (mpg) against weight (wt).
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  ggtitle("Weight vs Miles per Gallon") +
  xlab("Weight (1000 lbs)") +
  ylab("Miles per Gallon")

# Box plots of mpg grouped by cylinder count; cyl is converted to a factor so
# each distinct cylinder count gets its own box and fill colour.
ggplot(data = mtcars, mapping = aes(x = factor(cyl), y = mpg)) +
  geom_boxplot(mapping = aes(fill = factor(cyl))) +
  labs(
    title = "Miles per Gallon by Number of Cylinders",
    x = "Number of Cylinders",
    y = "Miles per Gallon"
  )

# Scatter plot of mpg against displacement, with points also coloured by
# displacement; the colour legend is titled "Index".
ggplot(data = mtcars, mapping = aes(x = disp, y = mpg)) +
  geom_point(mapping = aes(color = disp)) +
  labs(
    title = "Displacement vs. Miles per Gallon",
    x = "Displacement (cubic inches)",
    y = "Miles per Gallon"
  ) +
  scale_color_continuous(name = "Index")

Higher weight, more cylinders, and larger displacement are each associated with lower miles per gallon.

# Count the missing values in each column.
colSums(x = is.na(mtcars))
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0
# There are no missing values in any column.
# One box per column of mtcars; las = 2 draws axis labels perpendicular to the
# axis and cex.axis = 0.6 shrinks them.
boxplot(mtcars, las = 2, cex.axis = 0.6)

# Baseline model: regress mpg on weight, displacement, cylinder count, and
# horsepower using the full data set, then print the fit summary.
mtcars_lm <- lm(formula = mpg ~ wt + disp + cyl + hp, data = mtcars)
summary(object = mtcars_lm)
## 
## Call:
## lm(formula = mpg ~ wt + disp + cyl + hp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0562 -1.4636 -0.4281  1.2854  5.8269 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 40.82854    2.75747  14.807 1.76e-14 ***
## wt          -3.85390    1.01547  -3.795 0.000759 ***
## disp         0.01160    0.01173   0.989 0.331386    
## cyl         -1.29332    0.65588  -1.972 0.058947 .  
## hp          -0.02054    0.01215  -1.691 0.102379    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.513 on 27 degrees of freedom
## Multiple R-squared:  0.8486, Adjusted R-squared:  0.8262 
## F-statistic: 37.84 on 4 and 27 DF,  p-value: 1.061e-10
# wt is a significant predictor of mpg (p < 0.001). cyl is only marginally significant (p = 0.059), and disp and hp are not significant at the 5% level.

# In-sample mean squared error: the average of the squared residuals.
mean(residuals(mtcars_lm)^2)
## [1] 5.326386
# The mean squared error (MSE) is 5.326386.

Looking through the data, the displacement variable has the largest range. Let’s winsorize this variable and also truncate hp to remove the large outlier.

# Truncate: keep only cars with hp <= 300, dropping the high-horsepower
# outlier. (Despite its name, mt_cars_sorted is filtered, not sorted.)
mt_cars_sorted <- mtcars[mtcars$hp <= 300, ]

# Winsorize displacement: values below the 5th percentile or above the 95th
# percentile of the remaining rows are clamped to those percentiles.
# names = FALSE keeps the bounds as plain numbers (no "5%" / "95%" labels).
lower_bound_disp <- quantile(
  mt_cars_sorted$disp, 0.05,
  na.rm = TRUE, names = FALSE
)
upper_bound_disp <- quantile(
  mt_cars_sorted$disp, 0.95,
  na.rm = TRUE, names = FALSE
)

# The earlier version built its masks from `mt_cars_sorted$tax`, a column that
# does not exist in mtcars. That evaluates to NULL, the comparison becomes
# logical(0), and no value was ever replaced. Clamp `disp` itself instead.
mt_cars_sorted$disp <- pmin(
  pmax(mt_cars_sorted$disp, lower_bound_disp),
  upper_bound_disp
)
# NOTE(review): the rendered regression output for mt_cars_sorted shown later
# in this document predates this fix; re-knit to refresh those numbers.

Now let’s fit the same linear regression as before, this time on the trimmed data.

# Refit the baseline specification on the trimmed data set and print the
# fit summary.
mtcars_lm2 <- lm(
  formula = mpg ~ wt + disp + cyl + hp,
  data = mt_cars_sorted
)
summary(object = mtcars_lm2)
## 
## Call:
## lm(formula = mpg ~ wt + disp + cyl + hp, data = mt_cars_sorted)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9483 -1.2877 -0.3379  0.7952  5.7201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.13342    2.78171  14.787  3.6e-14 ***
## wt          -3.91774    1.01973  -3.842 0.000705 ***
## disp         0.01502    0.01229   1.221 0.232863    
## cyl         -1.20061    0.66446  -1.807 0.082366 .  
## hp          -0.03123    0.01661  -1.879 0.071432 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.518 on 26 degrees of freedom
## Multiple R-squared:  0.8501, Adjusted R-squared:  0.827 
## F-statistic: 36.86 on 4 and 26 DF,  p-value: 2.325e-10
# In-sample mean squared error of the refit model.
mean(residuals(mtcars_lm2)^2)
## [1] 5.315611
# The mean squared error (MSE) is 5.315611.

Lastly, let’s rerun both regressions with a cyl × hp interaction term to see whether the effect of horsepower on mpg depends on the number of cylinders.

# Interaction model on the full data set: the baseline predictors plus the
# cyl:hp interaction. This overwrites the earlier `mtcars_lm` fit.
mtcars_lm <- lm(
  formula = mpg ~ wt + disp + cyl + hp + cyl * hp,
  data = mtcars
)
summary(object = mtcars_lm)
## 
## Call:
## lm(formula = mpg ~ wt + disp + cyl + hp + cyl * hp, data = mtcars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.395 -1.426 -0.612  1.219  4.312 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 52.164411   5.031802  10.367 9.96e-11 ***
## wt          -3.302628   0.946204  -3.490  0.00174 ** 
## disp         0.003063   0.011144   0.275  0.78561    
## cyl         -2.780502   0.826175  -3.366  0.00238 ** 
## hp          -0.159849   0.054760  -2.919  0.00716 ** 
## cyl:hp       0.018380   0.007077   2.597  0.01527 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.281 on 26 degrees of freedom
## Multiple R-squared:  0.8798, Adjusted R-squared:  0.8567 
## F-statistic: 38.07 on 5 and 26 DF,  p-value: 3.697e-11
# Interaction model on the trimmed data set: the baseline predictors plus the
# cyl:hp interaction. This overwrites the earlier `mtcars_lm2` fit.
mtcars_lm2 <- lm(
  formula = mpg ~ wt + disp + cyl + hp + cyl * hp,
  data = mt_cars_sorted
)
summary(object = mtcars_lm2)
## 
## Call:
## lm(formula = mpg ~ wt + disp + cyl + hp + cyl * hp, data = mt_cars_sorted)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3926 -1.4547 -0.7343  1.2200  4.3282 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 51.959186   5.294022   9.815 4.67e-10 ***
## wt          -3.325420   0.975411  -3.409  0.00222 ** 
## disp         0.003811   0.012326   0.309  0.75972    
## cyl         -2.731965   0.897450  -3.044  0.00543 ** 
## hp          -0.158458   0.056522  -2.803  0.00963 ** 
## cyl:hp       0.017967   0.007682   2.339  0.02764 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.326 on 25 degrees of freedom
## Multiple R-squared:  0.877,  Adjusted R-squared:  0.8524 
## F-statistic: 35.65 on 5 and 25 DF,  p-value: 1.343e-10

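The cyl:hp interaction term is statistically significant at the 5% level in both models (p ≈ 0.015 and p ≈ 0.028), which suggests that the effect of horsepower on mpg depends on the number of cylinders.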