# Load the built-in Motor Trend car data set into the workspace.
data(mtcars)

# attach() is intentionally not used: the code that follows reaches the columns
# through `data = mtcars` or `mtcars$...`, and attaching the data frame is what
# makes its `mpg` column collide with the `mpg` data set shipped by ggplot2
# (the "masked from 'mtcars'" message printed when ggplot2 is loaded).

# View() opens a spreadsheet-style viewer, so only call it in an interactive
# session; a scripted or knitted run skips it.
if (interactive()) {
  View(mtcars)
}

# Per-column quartiles, extremes and mean.
summary(mtcars)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'mtcars':
## 
##     mpg
# Histogram of fuel economy, drawn with 2-mpg-wide bins.
ggplot(data = mtcars, mapping = aes(x = mpg)) +
  geom_histogram(binwidth = 2, colour = "black", fill = "lightblue") +
  ggtitle("Distribution of Miles Per Gallon (mpg)") +
  xlab("mpg") +
  ylab("Count")

# Scatter plot of fuel economy against vehicle weight.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point(colour = "green") +
  theme_minimal() +
  ggtitle("Miles Per Gallon vs. Weight") +
  xlab("Weight (1000 lbs)") +
  ylab("mpg")

Heavier cars usually have lower mpg.

# Pairwise correlations between all columns of mtcars.
cor_matrix <- cor(x = mtcars)
# Show only the column holding each variable's correlation with mpg.
print(cor_matrix[, "mpg"])
##        mpg        cyl       disp         hp       drat         wt       qsec 
##  1.0000000 -0.8521620 -0.8475514 -0.7761684  0.6811719 -0.8676594  0.4186840 
##         vs         am       gear       carb 
##  0.6640389  0.5998324  0.4802848 -0.5509251
# Total number of missing cells across the whole data frame.
print(sum(is.na(mtcars)))
## [1] 0

There is no missing data.

# Side-by-side box plots of every column on its raw scale. las = 2 draws the
# axis labels perpendicular to the axes and cex.axis = 0.6 shrinks them.
boxplot(x = mtcars, cex.axis = 0.6, las = 2)

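There may be some outliers, for example in displacement (disp) and horsepower (hp). Note that the variables are plotted on their raw scales, so disp and hp also stand out simply because their values are much larger than those of the other columns.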

# Ordinary least squares fit of mpg on all remaining columns of mtcars.
model <- lm(formula = mpg ~ ., data = mtcars)
# Coefficient table, residual summary and goodness-of-fit statistics.
print(summary(model))
## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

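Weight seems to have the biggest impact on reducing mpg, with a loss of about 3.72 mpg per 1000 lbs. Transmission type (am) appears to increase mpg the most, at about 2.52 mpg. Neither coefficient is significant at the 5% level in this model (p = 0.063 and p = 0.234), so both effects should be read with caution.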

# In-sample mean squared error of the full model: the average of the squared
# residuals over the rows the model was fitted on.
residuals <- resid(model)
mse <- mean(residuals * residuals)
print(mse)
## [1] 4.609201
# Same full model as before, extended with a weight-by-horsepower interaction
# (`wt * hp` expands to wt + hp + wt:hp; `.` adds the remaining columns).
model_interaction <- lm(formula = mpg ~ wt * hp + ., data = mtcars)
print(summary(model_interaction))
## 
## Call:
## lm(formula = mpg ~ wt * hp + ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6129 -1.4482  0.2571  1.1179  4.0907 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.903972  16.390539   1.702 0.104165    
## wt          -9.613350   2.439829  -3.940 0.000809 ***
## hp          -0.140989   0.041789  -3.374 0.003018 ** 
## cyl          1.011371   0.941887   1.074 0.295710    
## disp        -0.002363   0.015716  -0.150 0.882013    
## drat        -0.803048   1.455063  -0.552 0.587132    
## qsec         0.744333   0.611042   1.218 0.237347    
## vs           0.133431   1.759111   0.076 0.940291    
## am          -0.725300   1.999043  -0.363 0.720543    
## gear         2.907613   1.434933   2.026 0.056279 .  
## carb        -0.512939   0.699359  -0.733 0.471800    
## wt:hp        0.036219   0.011403   3.176 0.004746 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.214 on 20 degrees of freedom
## Multiple R-squared:  0.9129, Adjusted R-squared:  0.865 
## F-statistic: 19.06 on 11 and 20 DF,  p-value: 3.046e-08

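The R-squared of the original model was 0.869. Adding the interaction between weight and horsepower increased it to 0.9129 (adjusted R-squared rose from 0.8066 to 0.865), and the wt, hp, and wt:hp terms are all significant at the 1% level, indicating that both variables are important for mpg.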

# Redraw the per-column box plots (raw scale, perpendicular and shrunken axis
# labels) to look for extreme values before winsorizing.
boxplot(x = mtcars, cex.axis = 0.6, las = 2)

disp and hp may contain outliers.

# Winsorize mpg: values outside the 1st-99th percentile range are clamped to
# those percentiles instead of dropping the rows.
lower_bound_mpg <- quantile(mtcars$mpg, probs = 0.01, na.rm = TRUE)
upper_bound_mpg <- quantile(mtcars$mpg, probs = 0.99, na.rm = TRUE)

# pmax() raises anything below the lower bound and pmin() lowers anything above
# the upper bound; values already inside the range are left untouched.
mtcars$mpg_wins <- pmin(pmax(mtcars$mpg, lower_bound_mpg), upper_bound_mpg)

# Fit the full model on the winsorized response. mtcars still contains the
# original mpg column, so `.` on its own would use mpg to predict its own
# clamped copy; `- mpg` removes it from the predictors.
# NOTE: any output recorded for the earlier `mpg_wins ~ .` formula (R-squared
# 0.9999, with mpg listed as a coefficient) reflects that leakage and needs to
# be regenerated.
model_wins <- lm(mpg_wins ~ . - mpg, data = mtcars)
summary(model_wins)
## 
## Call:
## lm(formula = mpg_wins ~ ., data = mtcars)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.296022 -0.022676 -0.006357  0.029335  0.137883 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.645e-01  5.919e-01   1.629    0.119    
## mpg          9.862e-01  6.831e-03 144.366   <2e-16 ***
## cyl         -2.938e-02  3.272e-02  -0.898    0.380    
## disp         4.569e-05  5.664e-04   0.081    0.937    
## hp          -4.827e-04  6.971e-04  -0.692    0.497    
## drat        -1.372e-02  5.147e-02  -0.267    0.793    
## wt          -6.838e-03  6.450e-02  -0.106    0.917    
## qsec        -2.571e-02  2.356e-02  -1.092    0.288    
## vs           1.193e-02  6.591e-02   0.181    0.858    
## am          -5.301e-02  6.664e-02  -0.795    0.436    
## gear         2.721e-02  4.696e-02   0.579    0.569    
## carb        -4.574e-03  2.598e-02  -0.176    0.862    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08296 on 20 degrees of freedom
## Multiple R-squared:  0.9999, Adjusted R-squared:  0.9998 
## F-statistic: 1.47e+04 on 11 and 20 DF,  p-value: < 2.2e-16

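Winsorizing mpg changed the R-squared from 0.869 to 0.9999, which accounts for almost all of the variability with an almost perfect fit. However, the coefficient table above shows that the original mpg column was itself included as a predictor of mpg_wins (estimate 0.986, t = 144), so the jump comes from predicting mpg from a nearly identical copy of itself rather than from the winsorization. The model does account for the variation, but it will not work well on new data.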