Data Exploration

# Bring the mtcars dataset into the workspace and work on a copy named df,
# so the original object is left untouched by later steps.
data("mtcars")
df <- mtcars

# Structure overview: 32 rows, 11 columns, every column numeric.
str(df)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
# Per-column minimum, quartiles, mean and maximum.
summary(df)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

Correlation with mpg

# Take the "mpg" column of the full correlation matrix of df: one coefficient
# per variable, including mpg's correlation with itself (always 1).
correlations <- cor(df)[, "mpg"]
# Order from the strongest positive to the strongest negative relationship.
correlations <- sort(correlations, decreasing = TRUE)
correlations
##        mpg       drat         vs         am       gear       qsec       carb 
##  1.0000000  0.6811719  0.6640389  0.5998324  0.4802848  0.4186840 -0.5509251 
##         hp       disp        cyl         wt 
## -0.7761684 -0.8475514 -0.8521620 -0.8676594

Data Processing

# Total number of missing cells in df. Despite its name, any_NA holds a count
# (0 means nothing is missing), not a TRUE/FALSE flag.
any_NA <- sum(is.na(df))
# Logical check: is there at least one NA anywhere in df?
anyNA(df)
## [1] FALSE
# Missing-value count per column. This replaces summary(any_NA), which only
# produced a six-number summary of the single total computed above.
colSums(is.na(df))
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0

Build Model

# Baseline linear model: regress mpg on every other column of df
# (the `.` on the right-hand side stands for all remaining columns).
model <- lm(formula = mpg ~ ., data = df)
# Printing the fit echoes the call and the estimated coefficients.
model
## 
## Call:
## lm(formula = mpg ~ ., data = df)
## 
## Coefficients:
## (Intercept)          cyl         disp           hp         drat           wt  
##    12.30337     -0.11144      0.01334     -0.02148      0.78711     -3.71530  
##        qsec           vs           am         gear         carb  
##     0.82104      0.31776      2.52023      0.65541     -0.19942
# Full report: residual quantiles, per-coefficient t-tests, R-squared and the
# overall F-test.
summary(model)
## 
## Call:
## lm(formula = mpg ~ ., data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07
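The model as a whole is significant (F-test p-value = 3.793e-07), but no individual coefficient is significant at the 5% level (the closest is wt, p = 0.0633); this pattern often points to multicollinearity among the predictors.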
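Negative coefficients indicate that mpg decreases as that predictor increases, holding the others fixed (e.g. heavier or higher-powered cars are less fuel efficient).
Positive coefficients indicate that mpg increases as that predictor increases, i.e. a higher fuel-efficiency rating.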

Model Assumptions

# Draw the diagnostic plots produced by plot() for the fitted model in a
# 2 x 2 grid. par() returns the previous settings, which are saved and put
# back afterwards so that later plots are not forced into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

Evaluate Model Performance

# Fitted mpg for every row of df. The result is stored as predicted_mpg
# rather than `predict`, so the variable no longer shares its name with the
# predict() function it is computed from.
predicted_mpg <- predict(model, newdata = df)
# Mean squared error between observed and fitted mpg. df is the same data the
# model was fit to, so this is an in-sample (training) error, not an estimate
# of performance on unseen cars.
mse <- mean((df$mpg - predicted_mpg)^2)
mse
## [1] 4.609201

Model Interaction w/ Other Variables

# Same predictors as the baseline model plus a weight-by-horsepower
# interaction: `wt * hp` expands to wt + hp + wt:hp. The term order is kept
# as-is so the coefficient table below lines up with the formula.
model_interaction <- lm(
  formula = mpg ~ wt * hp + cyl + disp + drat + qsec + vs + am + gear + carb,
  data = df
)
summary(model_interaction)
## 
## Call:
## lm(formula = mpg ~ wt * hp + cyl + disp + drat + qsec + vs + 
##     am + gear + carb, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6129 -1.4482  0.2571  1.1179  4.0907 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.903972  16.390539   1.702 0.104165    
## wt          -9.613350   2.439829  -3.940 0.000809 ***
## hp          -0.140989   0.041789  -3.374 0.003018 ** 
## cyl          1.011371   0.941887   1.074 0.295710    
## disp        -0.002363   0.015716  -0.150 0.882013    
## drat        -0.803048   1.455063  -0.552 0.587132    
## qsec         0.744333   0.611042   1.218 0.237347    
## vs           0.133431   1.759111   0.076 0.940291    
## am          -0.725300   1.999043  -0.363 0.720543    
## gear         2.907613   1.434933   2.026 0.056279 .  
## carb        -0.512939   0.699359  -0.733 0.471800    
## wt:hp        0.036219   0.011403   3.176 0.004746 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.214 on 20 degrees of freedom
## Multiple R-squared:  0.9129, Adjusted R-squared:  0.865 
## F-statistic: 19.06 on 11 and 20 DF,  p-value: 3.046e-08

Outlier

# Flag observations whose standardized residual from the interaction model
# is larger than 2 in absolute value; which() returns their row positions,
# named by car.
outliers <- which(abs(rstandard(model_interaction)) > 2)
outliers
## Fiat 128 
##       18
# Show the flagged rows. They are taken from df, the data the model was fit
# on, rather than from mtcars, so the rows stay in step with the model if df
# is ever modified.
df[outliers, ]
##           mpg cyl disp hp drat  wt  qsec vs am gear carb
## Fiat 128 32.4   4 78.7 66 4.08 2.2 19.47  1  1    4    1

Winsorization

# Winsorize the response on a copy of the data: mpg values below the 1st
# percentile or above the 99th percentile are pulled in to those percentiles.
# No other column is changed.
df_winsor <- df
lower <- quantile(df_winsor$mpg, probs = 0.01)
upper <- quantile(df_winsor$mpg, probs = 0.99)
# Clamp into [lower, upper]: pmax() raises values below `lower`, pmin() then
# caps values above `upper`.
df_winsor$mpg <- pmin(pmax(df_winsor$mpg, lower), upper)

# Refit the interaction model (wt * hp = wt + hp + wt:hp, plus the remaining
# main effects) on the winsorized data so it can be compared with the fit on
# the raw data.
model_int_wins <- lm(
  formula = mpg ~ wt * hp + cyl + disp + drat + qsec + vs + am + gear + carb,
  data = df_winsor
)
summary(model_int_wins)
## 
## Call:
## lm(formula = mpg ~ wt * hp + cyl + disp + drat + qsec + vs + 
##     am + gear + carb, data = df_winsor)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.598 -1.399  0.260  1.136  4.173 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 28.444058  16.195810   1.756  0.09435 .  
## wt          -9.472667   2.410842  -3.929  0.00083 ***
## hp          -0.139227   0.041293  -3.372  0.00303 ** 
## cyl          0.965237   0.930697   1.037  0.31206    
## disp        -0.002245   0.015529  -0.145  0.88648    
## drat        -0.801740   1.437776  -0.558  0.58329    
## qsec         0.708513   0.603782   1.173  0.25440    
## vs           0.143969   1.738212   0.083  0.93481    
## am          -0.760277   1.975293  -0.385  0.70438    
## gear         2.889049   1.417885   2.038  0.05504 .  
## carb        -0.509644   0.691050  -0.737  0.46939    
## wt:hp        0.035629   0.011267   3.162  0.00490 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.188 on 20 degrees of freedom
## Multiple R-squared:  0.914,  Adjusted R-squared:  0.8667 
## F-statistic: 19.33 on 11 and 20 DF,  p-value: 2.698e-08

Does an improved R^2 improve the model?

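Yes, it does improve the model: R-squared lets models be compared against each other, which makes them easier to interpret. Because plain R-squared never decreases when terms are added, the comparison is best made on adjusted R-squared, which rises from 0.8066 for the main-effects model to 0.865 with the wt:hp interaction and to 0.8667 after winsorizing mpg.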