Question Number 2 :

library(data.table)
library(readr)
library(readxl)
library(ggplot2)
library(ggmosaic)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:ggmosaic':
## 
##     happy
library(corrplot)
## corrplot 0.94 loaded
library(car)
## Loading required package: carData
library(MASS)
# Echo the source of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)

# Folder that holds the input CSV (absolute, machine-specific path).
setdir <- "C:/Users/Risky's/Documents/R/R Notebook/mtcars/"

# Read the CSV into a data.table, display it, then inspect the column types.
dt_mtcars <- fread(paste0(setdir, "mtcars.csv"))
dt_mtcars
str(dt_mtcars)
## Classes 'data.table' and 'data.frame':   32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : int  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : int  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : int  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : int  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: int  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: int  4 4 1 1 2 1 4 2 2 4 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Per-column summary statistics (min, quartiles, median, mean, max).
summary(dt_mtcars)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
# Number of rows and columns in the loaded table.
dim(dt_mtcars)
## [1] 32 11
# Distribution of the response variable mpg.
hist(dt_mtcars[, mpg])

# Count the missing values in each column.
print(colSums(is.na(dt_mtcars)))
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0
# Pairwise correlations between mpg, displacement and weight.
cor(dt_mtcars[, .(mpg, disp, wt)])
##             mpg       disp         wt
## mpg   1.0000000 -0.8475514 -0.8676594
## disp -0.8475514  1.0000000  0.8879799
## wt   -0.8676594  0.8879799  1.0000000
# Full correlation matrix over columns 1 to 11 (every column of the table).
cor(dt_mtcars[, 1:11])
##             mpg        cyl       disp         hp        drat         wt
## mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684  0.68117191 -0.8676594
## cyl  -0.8521620  1.0000000  0.9020329  0.8324475 -0.69993811  0.7824958
## disp -0.8475514  0.9020329  1.0000000  0.7909486 -0.71021393  0.8879799
## hp   -0.7761684  0.8324475  0.7909486  1.0000000 -0.44875912  0.6587479
## drat  0.6811719 -0.6999381 -0.7102139 -0.4487591  1.00000000 -0.7124406
## wt   -0.8676594  0.7824958  0.8879799  0.6587479 -0.71244065  1.0000000
## qsec  0.4186840 -0.5912421 -0.4336979 -0.7082234  0.09120476 -0.1747159
## vs    0.6640389 -0.8108118 -0.7104159 -0.7230967  0.44027846 -0.5549157
## am    0.5998324 -0.5226070 -0.5912270 -0.2432043  0.71271113 -0.6924953
## gear  0.4802848 -0.4926866 -0.5555692 -0.1257043  0.69961013 -0.5832870
## carb -0.5509251  0.5269883  0.3949769  0.7498125 -0.09078980  0.4276059
##             qsec         vs          am       gear        carb
## mpg   0.41868403  0.6640389  0.59983243  0.4802848 -0.55092507
## cyl  -0.59124207 -0.8108118 -0.52260705 -0.4926866  0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692  0.39497686
## hp   -0.70822339 -0.7230967 -0.24320426 -0.1257043  0.74981247
## drat  0.09120476  0.4402785  0.71271113  0.6996101 -0.09078980
## wt   -0.17471588 -0.5549157 -0.69249526 -0.5832870  0.42760594
## qsec  1.00000000  0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs    0.74453544  1.0000000  0.16834512  0.2060233 -0.56960714
## am   -0.22986086  0.1683451  1.00000000  0.7940588  0.05753435
## gear -0.21268223  0.2060233  0.79405876  1.0000000  0.27407284
## carb -0.65624923 -0.5696071  0.05753435  0.2740728  1.00000000
# Shapiro-Wilk normality test on the response variable mpg.
shapiro.test(dt_mtcars[, mpg])
## 
##  Shapiro-Wilk normality test
## 
## data:  dt_mtcars[, mpg]
## W = 0.94756, p-value = 0.1229

#MULTIPLE LINEAR REGRESSION

Create a working copy of the table:

# Work on a copy so the type conversions that follow leave dt_mtcars untouched.
new_mtcars <- copy(dt_mtcars)

Convert the categorical variables to factors:

# Treat the discrete-valued columns as categorical: replace each one with its
# factor version. set() updates the column in place on the copied table.
for (factor_col in c("cyl", "gear", "carb", "am", "vs")) {
  set(new_mtcars, j = factor_col, value = as.factor(new_mtcars[[factor_col]]))
}

# Display the converted table, then a scatterplot matrix of all columns.
new_mtcars
pairs(new_mtcars)

Transmission vs MPG :

# Violin plot of mpg for each transmission type (am is a factor here).
ggplot(new_mtcars, aes(x = am, y = mpg)) +
  geom_violin() +
  labs(
    x = "Transmission (0=Automatic, 1=Manual)",
    y = "Miles/gallon"
  ) +
  theme_grey()

# Plot mpg, wt & vs: mpg against weight, coloured by engine shape. The colour
# aesthetic is set globally, so both the smoother and the points are split by vs.
ggplot(new_mtcars, aes(x = wt, y = mpg, color = vs)) +
  geom_smooth() +
  geom_point() +
  labs(
    x = "Weight",
    y = "Miles/gallon",
    color = "Engine (0=VShape,1=Straight)"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Create the model :

# Fit the full model: mpg regressed on every other column of new_mtcars.
# The factor columns (cyl, gear, carb, am, vs) enter as dummy variables.
multiregression2 <- lm(mpg ~ ., data = new_mtcars)
# Coefficient table, residual summary, R-squared and overall F-test.
summary(multiregression2)
## 
## Call:
## lm(formula = mpg ~ ., data = new_mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5087 -1.3584 -0.0948  0.7745  4.6251 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 23.87913   20.06582   1.190   0.2525  
## cyl6        -2.64870    3.04089  -0.871   0.3975  
## cyl8        -0.33616    7.15954  -0.047   0.9632  
## disp         0.03555    0.03190   1.114   0.2827  
## hp          -0.07051    0.03943  -1.788   0.0939 .
## drat         1.18283    2.48348   0.476   0.6407  
## wt          -4.52978    2.53875  -1.784   0.0946 .
## qsec         0.36784    0.93540   0.393   0.6997  
## vs1          1.93085    2.87126   0.672   0.5115  
## am1          1.21212    3.21355   0.377   0.7113  
## gear4        1.11435    3.79952   0.293   0.7733  
## gear5        2.52840    3.73636   0.677   0.5089  
## carb2       -0.97935    2.31797  -0.423   0.6787  
## carb3        2.99964    4.29355   0.699   0.4955  
## carb4        1.09142    4.44962   0.245   0.8096  
## carb6        4.47757    6.38406   0.701   0.4938  
## carb8        7.25041    8.36057   0.867   0.3995  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.833 on 15 degrees of freedom
## Multiple R-squared:  0.8931, Adjusted R-squared:  0.779 
## F-statistic:  7.83 on 16 and 15 DF,  p-value: 0.000124

Feature Selection (backward stepwise elimination based on AIC):

# Backward stepwise selection starting from the full model: step() removes
# terms while doing so improves the AIC. trace = 0 silences the per-step log.
back_multiregression2 <- step(
  multiregression2,
  direction = "backward",
  trace = 0
)
summary(back_multiregression2)
## 
## Call:
## lm(formula = mpg ~ cyl + hp + wt + am, data = new_mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9387 -1.2560 -0.4013  1.1253  5.0513 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.70832    2.60489  12.940 7.73e-13 ***
## cyl6        -3.03134    1.40728  -2.154  0.04068 *  
## cyl8        -2.16368    2.28425  -0.947  0.35225    
## hp          -0.03211    0.01369  -2.345  0.02693 *  
## wt          -2.49683    0.88559  -2.819  0.00908 ** 
## am1          1.80921    1.39630   1.296  0.20646    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.41 on 26 degrees of freedom
## Multiple R-squared:  0.8659, Adjusted R-squared:  0.8401 
## F-statistic: 33.57 on 5 and 26 DF,  p-value: 1.506e-10

Interpretation:

  1. The intercept (33.71) is the predicted miles per gallon (mpg) for an automatic car with 4 cylinders when horsepower (hp) and weight (wt) are both zero; it serves as the baseline for the other coefficients rather than as a realistic prediction.
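  2. Moving from 4 to 6 cylinders decreases mpg by 3.031, holding the other predictors constant.
  3. Moving from 4 to 8 cylinders decreases mpg by 2.16, holding the other predictors constant.
  4. A 1-unit increase in horsepower (hp) decreases mpg by 0.032, holding the other predictors constant.
  5. A 1-unit increase in weight (wt) decreases mpg by 2.5, holding the other predictors constant.
  6. Manual transmission increases mpg by 1.809 relative to automatic, holding the other predictors constant.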
# Diagnostic plots to check the regression assumptions.
# plot() on the fitted model draws four diagnostic panels; show them together
# in a 2x2 grid. par() returns the previous settings, which are restored
# afterwards so the grid layout does not leak into later plots.
old_par <- par(mfrow = c(2, 2))  # 2x2 plot layout
plot(back_multiregression2)
par(old_par)

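Here’s a description of the diagnostic plots:

  1. Residuals vs Fitted (Top Left): checks the linearity assumption; a curved trend in the residuals points to non-linearity.
  2. Normal Q-Q Plot (Top Right): checks whether the residuals are normally distributed; points far from the reference line indicate departures from normality or potential outliers.
  3. Scale-Location (Bottom Left): checks for constant residual variance; a sloped trend indicates heteroscedasticity.
  4. Residuals vs Leverage (Bottom Right): identifies influential observations, i.e. points with both high leverage and large residuals.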

Overall, the diagnostic plots suggest that while the model performs reasonably well, there may be concerns regarding non-linearity, potential outliers, and slight heteroscedasticity. Adjustments to the model or further investigation of influential points may be needed.

# Check for multicollinearity using VIF (Variance Inflation Factor).
# The model contains a multi-level factor (cyl), so the output is reported as
# generalized VIFs with a Df column.
car::vif(back_multiregression2)
##         GVIF Df GVIF^(1/(2*Df))
## cyl 5.824545  2        1.553515
## hp  4.703625  1        2.168784
## wt  4.007113  1        2.001778
## am  2.590777  1        1.609589

The Generalized Variance Inflation Factor (GVIF) was calculated for each model term to check for multicollinearity.

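GVIF: Generalized Variance Inflation Factor, an extension of the VIF to terms that have more than one coefficient (such as the factor cyl). Df: Degrees of freedom, i.e. the number of coefficients, for each term. GVIF^(1/(2*Df)): the GVIF adjusted for Df so that terms with different degrees of freedom can be compared; for a term with Df = 1 it equals the square root of the usual VIF.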

Interpretation :

  1. cyl: GVIF^(1/(2*Df)) = 1.553515
  2. hp: GVIF^(1/(2*Df)) = 2.168784
  3. wt: GVIF^(1/(2*Df)) = 2.001778
  4. am: GVIF^(1/(2*Df)) = 1.609589

Generally, a GVIF^(1/(2*Df)) value greater than 2.5 or 3 indicates potential multicollinearity issues. In this case, none of the variables exceed this threshold, suggesting that multicollinearity is not a significant problem for the model.

# Conclusion (print output)
# Heading for the list of terms kept by backward selection.
cat("\nBest variables selected for predicting mpg:\n")
## 
## Best variables selected for predicting mpg:
# Coefficient names of the final model; factor levels appear as dummy terms.
print(names(coef(back_multiregression2)))
## [1] "(Intercept)" "cyl6"        "cyl8"        "hp"          "wt"         
## [6] "am1"
# Show the response next to the retained predictors.
new_mtcars[, .(mpg, hp, wt, am, cyl)]
# Heading for the final model summary.
cat("\nModel Summary:\n")
## 
## Model Summary:
print(summary(back_multiregression2))
## 
## Call:
## lm(formula = mpg ~ cyl + hp + wt + am, data = new_mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9387 -1.2560 -0.4013  1.1253  5.0513 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.70832    2.60489  12.940 7.73e-13 ***
## cyl6        -3.03134    1.40728  -2.154  0.04068 *  
## cyl8        -2.16368    2.28425  -0.947  0.35225    
## hp          -0.03211    0.01369  -2.345  0.02693 *  
## wt          -2.49683    0.88559  -2.819  0.00908 ** 
## am1          1.80921    1.39630   1.296  0.20646    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.41 on 26 degrees of freedom
## Multiple R-squared:  0.8659, Adjusted R-squared:  0.8401 
## F-statistic: 33.57 on 5 and 26 DF,  p-value: 1.506e-10

Based on the linear regression model summary, here are the key conclusions:

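  1. Significant Predictors: cyl6 (p = 0.041), hp (p = 0.027), and wt (p = 0.009) are significant at the 5% level.
  2. Non-Significant Predictors: cyl8 (p = 0.352) and am1 (p = 0.206) are not significant at the 5% level.
  3. Model Fit: The multiple R-squared is 0.8659 and the adjusted R-squared is 0.8401, with a residual standard error of 2.41 on 26 degrees of freedom.
  4. Overall Significance: The F-statistic is 33.57 on 5 and 26 degrees of freedom (p-value = 1.506e-10), so the model as a whole is highly significant.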

In summary, the model suggests that the number of cylinders (specifically 6 cylinders), horsepower, and weight are significant factors affecting a car’s fuel efficiency, while the type of transmission and having 8 cylinders are not significant predictors in this context. The model fits the data well and explains a large portion of the variability in fuel efficiency.