# Load Packages
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

library(ggplot2)
library(corrplot)

## corrplot 0.94 loaded

# Load dataset
# Work on a copy of the built-in mtcars data frame. The copy is deliberately
# not attach()ed: attaching would put its columns on the search path, where
# `mpg` masks ggplot2::mpg. Columns are reached explicitly instead, either as
# Cars$<column> or through the `data =` argument of the modelling functions.
Cars <- mtcars

# Data Exploration
# Preview the first six rows (six is head()'s default, spelled out here).
head(Cars, n = 6L)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

# Per-column minimum, quartiles, mean and maximum for the whole data frame.
summary(object = Cars)

##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

# Pairwise Pearson correlations between all columns (Pearson is cor()'s
# default method, named explicitly here), followed by a printout of the matrix.
mpg_corr_matrix <- cor(Cars, method = "pearson")
print(mpg_corr_matrix)

##             mpg        cyl       disp         hp        drat         wt
## mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684  0.68117191 -0.8676594
## cyl  -0.8521620  1.0000000  0.9020329  0.8324475 -0.69993811  0.7824958
## disp -0.8475514  0.9020329  1.0000000  0.7909486 -0.71021393  0.8879799
## hp   -0.7761684  0.8324475  0.7909486  1.0000000 -0.44875912  0.6587479
## drat  0.6811719 -0.6999381 -0.7102139 -0.4487591  1.00000000 -0.7124406
## wt   -0.8676594  0.7824958  0.8879799  0.6587479 -0.71244065  1.0000000
## qsec  0.4186840 -0.5912421 -0.4336979 -0.7082234  0.09120476 -0.1747159
## vs    0.6640389 -0.8108118 -0.7104159 -0.7230967  0.44027846 -0.5549157
## am    0.5998324 -0.5226070 -0.5912270 -0.2432043  0.71271113 -0.6924953
## gear  0.4802848 -0.4926866 -0.5555692 -0.1257043  0.69961013 -0.5832870
## carb -0.5509251  0.5269883  0.3949769  0.7498125 -0.09078980  0.4276059
##             qsec         vs          am       gear        carb
## mpg   0.41868403  0.6640389  0.59983243  0.4802848 -0.55092507
## cyl  -0.59124207 -0.8108118 -0.52260705 -0.4926866  0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692  0.39497686
## hp   -0.70822339 -0.7230967 -0.24320426 -0.1257043  0.74981247
## drat  0.09120476  0.4402785  0.71271113  0.6996101 -0.09078980
## wt   -0.17471588 -0.5549157 -0.69249526 -0.5832870  0.42760594
## qsec  1.00000000  0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs    0.74453544  1.0000000  0.16834512  0.2060233 -0.56960714
## am   -0.22986086  0.1683451  1.00000000  0.7940588  0.05753435
## gear -0.21268223  0.2060233  0.79405876  1.0000000  0.27407284
## carb -0.65624923 -0.5696071  0.05753435  0.2740728  1.00000000

# Number of cars per cylinder count. Reads from `Cars`, the working copy used
# by the rest of the exploration, rather than from mtcars directly.
table(Cars$cyl)

## 
##  4  6  8 
## 11  7 14

# Distribution of the response variable, mpg.
summary(Cars$mpg)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.40   15.43   19.20   20.09   22.80   33.90

# Distribution of horsepower.
summary(Cars$hp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    52.0    96.5   123.0   146.7   180.0   335.0

# Distribution of weight, the predictor with the strongest correlation
# with mpg in the correlation matrix above.
summary(Cars$wt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.513   2.581   3.325   3.217   3.610   5.424

Wt (weight) has the strongest negative correlation with mpg, with a value of -0.8676594. This suggests that as the weight of the car increases, the miles per gallon decrease.
Cyl (number of cylinders) has the second strongest negative correlation with mpg, with a value of -0.8521620. This indicates that cars with more cylinders tend to have lower fuel efficiency.
Hp (horsepower) also negatively correlates with mpg, with a value of -0.7761684. This means that higher horsepower generally results in lower fuel efficiency.

Data Processing

No data is missing.

# Missing-value audit: number of NA entries in each column.
print(colSums(is.na(Cars)))

##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0

# One boxplot per column, drawn side by side on a shared axis, to make
# outlying values visible.
boxplot(
  Cars,
  col = "orange",
  main = "Boxplot detecting outliers"
)

# Correlogram of mpg_corr_matrix: circles, upper triangle only, variables
# ordered by hierarchical clustering, black axis labels rotated 45 degrees.
# corrplot is already loaded with the other packages at the top of the file,
# so it is not loaded a second time here.
corrplot(
  mpg_corr_matrix,
  method = "circle",
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)

Insights from the Boxplot Matrix:

Variance: We can see that the variables disp and hp have a larger variance than the other variables.
Outliers: The main potential inconsistency comes from the outliers observed in hp and qsec. These outliers may or may not be problematic, but they should be investigated further to determine if they are accurate or if they represent data entry errors or unusual conditions. My guess is that for hp, there could be a sports car or a truck/16 wheeler that look abnormal compared to the other observations.
wt and carb each have one or two outliers, but they are not extreme.
For most of the variables, the boxplot is relatively compact, with no visible outliers, suggesting consistency across the dataset.

Insights from the Correlation Matrix:

wt (weight): There is a strong negative correlation between mpg and wt (around -0.9). This suggests that as a car’s weight increases, its fuel efficiency decreases.
qsec (1/4 mile time) is negatively correlated with variables such as hp, wt, and cyl.
vs (engine shape) and am (transmission type) have relatively weak correlations with most other variables, suggesting they are less directly related to fuel efficiency or engine performance characteristics.

Linear Regression

# `Cars` already exists in the workspace (it is created in the "Load dataset"
# step), so nothing has to be loaded here. data(Cars) only raised
# "data set 'Cars' not found" because Cars is a local object, not a packaged
# data set, and attach() is unnecessary because every model below receives
# its data through the `data =` argument.

# Baseline model: regress mpg on every other column of mtcars
# (`.` stands for "all remaining columns").
Cars_lm_mpg <- lm(
  formula = mpg ~ .,
  data = mtcars
)
summary(Cars_lm_mpg)

## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

Linear Regression Insights

Intercept (12.30337): This is the expected value of mpg when all other predictors (e.g., wt, qsec, am) are zero. While not directly meaningful in this case because cars can’t have zero weight, acceleration, or transmission type, the intercept is required for the regression equation.
cyl (-0.11144): Each additional cylinder is associated with a decrease in mpg of about 0.11. Not statistically significant at the 0.05 significance level (p-value 0.9161).
disp (0.01334): For every one-unit increase in displacement, mpg increases by about 0.0133; not statistically significant at the 0.05 significance level (p-value 0.4635).
hp (-0.02148): Each additional unit of horsepower decreases mpg by about 0.0215, but again, this is not statistically significant (p-value 0.3350).
wt This coefficient is significant at the 10% level (p-value = 0.0633). It indicates that for every additional 1,000 pounds in weight, the car’s fuel efficiency decreases by about 3.7 mpg. This result aligns with expectations; heavier cars typically have lower fuel efficiency.
qsec Each additional second in 1/4 mile time (qsec) increases mpg by 0.82, suggesting that cars with slower acceleration tend to be more fuel-efficient. This relationship is not statistically significant (p-value 0.2739).
vs The engine shape has a very weak positive association with mpg, but it’s highly insignificant (p-value 0.8814).
am Cars with manual transmission (am = 1) have 2.52 more mpg than cars with automatic transmission, but the p-value is 0.2340, so this is not statistically significant.
gear The number of forward gears doesn’t show any significant effect on mpg (p-value 0.6652).

Best fit Linear Regression

In my opinion, the independent variables weight, hp, and cyl would make the best fit for a linear regression. The p-value of 0.140 is high for horsepower, but I think it is an important factor. The only significant variable is **weight**, with a p-value of 0.000199.

# Reduced model: weight, horsepower and cylinder count as the only predictors.
mpg_lm_bestfit <- lm(
  formula = mpg ~ wt + hp + cyl,
  data = mtcars
)
summary(mpg_lm_bestfit)

## 
## Call:
## lm(formula = mpg ~ wt + hp + cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9290 -1.5598 -0.5311  1.1850  5.8986 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 38.75179    1.78686  21.687  < 2e-16 ***
## wt          -3.16697    0.74058  -4.276 0.000199 ***
## hp          -0.01804    0.01188  -1.519 0.140015    
## cyl         -0.94162    0.55092  -1.709 0.098480 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.512 on 28 degrees of freedom
## Multiple R-squared:  0.8431, Adjusted R-squared:  0.8263 
## F-statistic: 50.17 on 3 and 28 DF,  p-value: 2.184e-11

Diagnostic Plots

# Draw the lm diagnostic plots for the full model on a single 2 x 2 page.
# par() returns the settings it replaces; they are restored afterwards so that
# later plots are not forced into the 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))
plot(Cars_lm_mpg)
par(old_par)

* Linearity: The Residuals vs Fitted plot shows a slight curve, indicating that the linearity assumption may not be perfectly met (the relationship between predictors and mpg might not be fully linear).
* Normality of Residuals: The Q-Q plot mostly follows a straight line, suggesting that the residuals are approximately normally distributed, though there are a few outliers (e.g., Chrysler Imperial, Fiat 128, Ford Pantera L).
* Homoscedasticity: The Scale-Location plot shows a slightly increasing trend, which suggests that the residuals may not have constant variance (some heteroscedasticity is present).
* Influential Points: The Residuals vs Leverage plot identifies influential data points, particularly Chrysler Imperial and Ford Pantera L, which may disproportionately affect the model.

Evaluate MSE

# In-sample mean squared error of the full model.
# Observed values are taken from mtcars, the data the model was fitted on,
# and the residual vector is not named `residuals`, which would shadow the
# stats::residuals() function.
predicted_mpg <- predict(Cars_lm_mpg)
mpg_residuals <- mtcars$mpg - predicted_mpg
mse <- mean(mpg_residuals^2)
mse

## [1] 4.609201

An MSE of 4.609201 (about 4.61) for the Cars dataset, with mpg (miles per gallon) as the dependent variable, indicates that the average squared difference between the actual mpg values and the predicted values from the model is 4.61. This suggests that, while the model predicts mpg reasonably well, there are moderate differences between the actual and predicted fuel efficiency values. A lower MSE would indicate a better fit, meaning less prediction error. To further understand the error in terms of mpg, you could also compute the Root Mean Squared Error (RMSE), which would be approximately 2.15 mpg.

Interaction

# All ten main effects plus the weight-by-horsepower interaction.
# `wt * hp` expands to wt + hp + wt:hp; both main effects are already listed,
# so the only term it adds is the interaction.
full_model <- lm(
  formula = mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb + wt * hp,
  data = mtcars
)
summary(full_model)

## 
## Call:
## lm(formula = mpg ~ cyl + disp + hp + drat + wt + qsec + vs + 
##     am + gear + carb + wt * hp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6129 -1.4482  0.2571  1.1179  4.0907 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.903972  16.390539   1.702 0.104165    
## cyl          1.011371   0.941887   1.074 0.295710    
## disp        -0.002363   0.015716  -0.150 0.882013    
## hp          -0.140989   0.041789  -3.374 0.003018 ** 
## drat        -0.803048   1.455063  -0.552 0.587132    
## wt          -9.613350   2.439829  -3.940 0.000809 ***
## qsec         0.744333   0.611042   1.218 0.237347    
## vs           0.133431   1.759111   0.076 0.940291    
## am          -0.725300   1.999043  -0.363 0.720543    
## gear         2.907613   1.434933   2.026 0.056279 .  
## carb        -0.512939   0.699359  -0.733 0.471800    
## hp:wt        0.036219   0.011403   3.176 0.004746 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.214 on 20 degrees of freedom
## Multiple R-squared:  0.9129, Adjusted R-squared:  0.865 
## F-statistic: 19.06 on 11 and 20 DF,  p-value: 3.046e-08

# Interaction between weight and horsepower (wt * hp, i.e. both main effects
# and their product), with displacement and cylinder count as extra predictors.
Cars_lm_interaction <- lm(
  formula = mpg ~ wt * hp + disp + cyl,
  data = mtcars
)
summary(Cars_lm_interaction)

## 
## Call:
## lm(formula = mpg ~ wt * hp + disp + cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4093 -1.6584 -0.5678  1.4284  4.5726 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 49.569405   3.816026  12.990 7.09e-13 ***
## wt          -7.643723   1.558423  -4.905 4.32e-05 ***
## hp          -0.107661   0.031230  -3.447  0.00194 ** 
## disp         0.001079   0.010918   0.099  0.92204    
## cyl         -0.404110   0.650595  -0.621  0.53992    
## wt:hp        0.025561   0.008608   2.969  0.00634 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.213 on 26 degrees of freedom
## Multiple R-squared:  0.887,  Adjusted R-squared:  0.8652 
## F-statistic:  40.8 on 5 and 26 DF,  p-value: 1.684e-11

The interaction term wt:hp is statistically significant, with a p-value of 0.00634 (well below 0.05), meaning the interaction between weight and horsepower has a significant effect on predicting mpg.
The coefficient for the interaction term (0.025561) suggests that the combined effect of weight and horsepower is not simply additive; instead, the impact of one variable on mpg depends on the level of the other variable. Specifically, for every increase in horsepower, the effect of weight on mpg changes by approximately 0.0256 units.
The R-squared value is 0.887, meaning that 88.7% of the variance in mpg is explained by this model.
The adjusted R-squared is 0.8652, which accounts for the number of predictors in the model and suggests that the model fits the data well, even after adjusting for the complexity added by including multiple terms and the interaction. As stated by Professor Zu, the adjusted R-squared penalizes adding more variables, but in this case the model works because of the low p-value (high significance) of the interaction variable.
wt and hp both remain statistically significant on their own (with p-values well below 0.05), indicating that both weight and horsepower have strong individual effects on mpg.
I think removing the independent variables disp and cyl will be a good idea since their p-values are weak and non-significant. This would leave me with a factorial anova with the main effects being wt and hp, and the interaction wt*hp.

winsorization

# Winsorization of hp: values below the 1st percentile or above the 99th
# percentile are pulled in to those percentiles; everything in between is
# left unchanged. The result is stored in a new column so the raw hp values
# remain available.
lower_bound_hp <- quantile(Cars$hp, probs = 0.01)
upper_bound_hp <- quantile(Cars$hp, probs = 0.99)
# Clamp in one vectorised step: pmax() lifts low values up to the lower
# bound, pmin() caps high values at the upper bound.
Cars$hp_winsorized <- pmin(pmax(Cars$hp, lower_bound_hp), upper_bound_hp)

# Reference fit on the raw horsepower values: hp, wt and their interaction.
model_before <- lm(
  formula = mpg ~ hp * wt,
  data = Cars
)
summary(model_before)

## 
## Call:
## lm(formula = mpg ~ hp * wt, data = Cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0632 -1.6491 -0.7362  1.4211  4.5513 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 49.80842    3.60516  13.816 5.01e-14 ***
## hp          -0.12010    0.02470  -4.863 4.04e-05 ***
## wt          -8.21662    1.26971  -6.471 5.20e-07 ***
## hp:wt        0.02785    0.00742   3.753 0.000811 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.153 on 28 degrees of freedom
## Multiple R-squared:  0.8848, Adjusted R-squared:  0.8724 
## F-statistic: 71.66 on 3 and 28 DF,  p-value: 2.981e-13

# Same specification as the raw-hp fit, with the winsorized horsepower column
# in place of hp, so the two summaries can be compared directly.
model_after <- lm(
  formula = mpg ~ hp_winsorized * wt,
  data = Cars
)
summary(model_after)

## 
## Call:
## lm(formula = mpg ~ hp_winsorized * wt, data = Cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0915 -1.6182 -0.7639  1.3622  4.4830 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      49.930327   3.609565  13.833 4.86e-14 ***
## hp_winsorized    -0.122091   0.024660  -4.951 3.17e-05 ***
## wt               -8.191877   1.279261  -6.404 6.22e-07 ***
## hp_winsorized:wt  0.028008   0.007425   3.772 0.000771 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.134 on 28 degrees of freedom
## Multiple R-squared:  0.8868, Adjusted R-squared:  0.8747 
## F-statistic: 73.11 on 3 and 28 DF,  p-value: 2.328e-13

The hp variable had noticeable outliers in the dataset, as identified by the boxplot, suggesting the presence of extreme values that could influence the regression model.
Winsorization was applied to hp, capping its extreme values at the 1st and 99th percentiles to mitigate the effect of outliers without removing data points.
Before winsorization, the model’s Multiple R-squared was 0.8848, and the Adjusted R-squared was 0.8724.
After winsorization, both the Multiple R-squared and Adjusted R-squared increased slightly to 0.8868 and 0.8747, respectively, indicating a marginal improvement in the model’s ability to explain the variance in mpg.
The coefficients for hp, wt, and the interaction term hp:wt remained significant in both models, with the hp coefficient becoming slightly more negative after winsorization (-0.1201 to -0.1221), suggesting a stronger negative relationship between hp and mpg once extreme values were reduced.
The residual standard error decreased slightly from 2.153 to 2.134, confirming that winsorization helped reduce variability and improve model stability by minimizing the influence of extreme outliers.

HW3

Pablo Lomelin

2024-10-05