Campaign - Senior Data Analyst Test Case

Question Number 2 :

library(data.table)
library(readr)
library(readxl)
library(ggplot2)
library(ggmosaic)
library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:ggmosaic':
## 
##     happy

library(corrplot)

## corrplot 0.94 loaded

library(car)

## Loading required package: carData

library(MASS)

knitr::opts_chunk$set(echo = TRUE)

# Folder that holds the exported mtcars CSV.
# NOTE(review): this is a machine-specific absolute path; adjust it when the
# analysis is run on another computer.
setdir <- "C:/Users/Risky's/Documents/R/R Notebook/mtcars/"
mtcars_csv <- paste0(setdir, "mtcars.csv")

# Read the CSV when it is present. Otherwise fall back to the mtcars data set
# that ships with R (presumably the source of the CSV -- confirm), so the
# analysis does not stop with a "file not found" error on other machines.
if (file.exists(mtcars_csv)) {
  dt_mtcars <- fread(mtcars_csv)
} else {
  message("File not found: ", mtcars_csv, " - using datasets::mtcars instead.")
  dt_mtcars <- as.data.table(datasets::mtcars)
}
dt_mtcars

# Inspect the structure of the loaded table: column names and storage types
str(dt_mtcars)

## Classes 'data.table' and 'data.frame':   32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : int  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : int  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : int  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : int  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: int  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: int  4 4 1 1 2 1 4 2 2 4 ...
##  - attr(*, ".internal.selfref")=<externalptr>

# Per-column summary statistics (min, quartiles, mean, max)
summary(dt_mtcars)

##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

# Number of rows and columns in the table
dim(dt_mtcars)

## [1] 32 11

# Distribution of the response variable (mpg)
hist(dt_mtcars[, mpg])

# Number of missing values in each column
na_counts <- colSums(is.na(dt_mtcars))
print(na_counts)

##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0

# Correlation matrix restricted to mpg, displacement and weight
cor(dt_mtcars[, .SD, .SDcols = c("mpg", "disp", "wt")])

##             mpg       disp         wt
## mpg   1.0000000 -0.8475514 -0.8676594
## disp -0.8475514  1.0000000  0.8879799
## wt   -0.8676594  0.8879799  1.0000000

# Correlation matrix across the first 11 columns (all variables in the table)
cor(dt_mtcars[, .SD, .SDcols = 1:11])

##             mpg        cyl       disp         hp        drat         wt
## mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684  0.68117191 -0.8676594
## cyl  -0.8521620  1.0000000  0.9020329  0.8324475 -0.69993811  0.7824958
## disp -0.8475514  0.9020329  1.0000000  0.7909486 -0.71021393  0.8879799
## hp   -0.7761684  0.8324475  0.7909486  1.0000000 -0.44875912  0.6587479
## drat  0.6811719 -0.6999381 -0.7102139 -0.4487591  1.00000000 -0.7124406
## wt   -0.8676594  0.7824958  0.8879799  0.6587479 -0.71244065  1.0000000
## qsec  0.4186840 -0.5912421 -0.4336979 -0.7082234  0.09120476 -0.1747159
## vs    0.6640389 -0.8108118 -0.7104159 -0.7230967  0.44027846 -0.5549157
## am    0.5998324 -0.5226070 -0.5912270 -0.2432043  0.71271113 -0.6924953
## gear  0.4802848 -0.4926866 -0.5555692 -0.1257043  0.69961013 -0.5832870
## carb -0.5509251  0.5269883  0.3949769  0.7498125 -0.09078980  0.4276059
##             qsec         vs          am       gear        carb
## mpg   0.41868403  0.6640389  0.59983243  0.4802848 -0.55092507
## cyl  -0.59124207 -0.8108118 -0.52260705 -0.4926866  0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692  0.39497686
## hp   -0.70822339 -0.7230967 -0.24320426 -0.1257043  0.74981247
## drat  0.09120476  0.4402785  0.71271113  0.6996101 -0.09078980
## wt   -0.17471588 -0.5549157 -0.69249526 -0.5832870  0.42760594
## qsec  1.00000000  0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs    0.74453544  1.0000000  0.16834512  0.2060233 -0.56960714
## am   -0.22986086  0.1683451  1.00000000  0.7940588  0.05753435
## gear -0.21268223  0.2060233  0.79405876  1.0000000  0.27407284
## carb -0.65624923 -0.5696071  0.05753435  0.2740728  1.00000000

# Shapiro-Wilk normality test on the response variable (mpg)
shapiro.test(dt_mtcars[, mpg])

## 
##  Shapiro-Wilk normality test
## 
## data:  dt_mtcars[, mpg]
## W = 0.94756, p-value = 0.1229

# MULTIPLE LINEAR REGRESSION

Create new table

# Work on an independent copy so the type conversions that follow do not
# alter dt_mtcars (data.table objects are otherwise shared by reference)
new_mtcars <- copy(dt_mtcars)

Convert Data Type :

# Treat the discrete-valued columns as categorical (factor) variables.
factor_cols <- c("cyl", "gear", "carb", "am", "vs")
for (col_name in factor_cols) {
  set(new_mtcars, j = col_name, value = as.factor(new_mtcars[[col_name]]))
}

new_mtcars

pairs(new_mtcars)

Transmission vs MPG :

# Violin plot: distribution of mpg for each transmission type
ggplot(new_mtcars, aes(x = am, y = mpg)) +
  geom_violin() +
  labs(x = "Transmission (0=Automatic, 1=Manual)", y = "Miles/gallon") +
  theme_grey()

# Scatter plot of mpg against weight with a smoother, coloured by engine shape
ggplot(new_mtcars, aes(x = wt, y = mpg, color = vs)) +
  geom_smooth() +
  geom_point() +
  labs(
    x = "Weight",
    y = "Miles/gallon",
    color = "Engine (0=VShape,1=Straight)"
  )

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Create the model :

# Full model: regress mpg on every other column of new_mtcars
multiregression2 <- lm(formula = mpg ~ ., data = new_mtcars)
summary(multiregression2)

## 
## Call:
## lm(formula = mpg ~ ., data = new_mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5087 -1.3584 -0.0948  0.7745  4.6251 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 23.87913   20.06582   1.190   0.2525  
## cyl6        -2.64870    3.04089  -0.871   0.3975  
## cyl8        -0.33616    7.15954  -0.047   0.9632  
## disp         0.03555    0.03190   1.114   0.2827  
## hp          -0.07051    0.03943  -1.788   0.0939 .
## drat         1.18283    2.48348   0.476   0.6407  
## wt          -4.52978    2.53875  -1.784   0.0946 .
## qsec         0.36784    0.93540   0.393   0.6997  
## vs1          1.93085    2.87126   0.672   0.5115  
## am1          1.21212    3.21355   0.377   0.7113  
## gear4        1.11435    3.79952   0.293   0.7733  
## gear5        2.52840    3.73636   0.677   0.5089  
## carb2       -0.97935    2.31797  -0.423   0.6787  
## carb3        2.99964    4.29355   0.699   0.4955  
## carb4        1.09142    4.44962   0.245   0.8096  
## carb6        4.47757    6.38406   0.701   0.4938  
## carb8        7.25041    8.36057   0.867   0.3995  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.833 on 15 degrees of freedom
## Multiple R-squared:  0.8931, Adjusted R-squared:  0.779 
## F-statistic:  7.83 on 16 and 15 DF,  p-value: 0.000124

Feature Selection (backward stepwise elimination by AIC) :

# Backward elimination starting from the full model; trace = 0 suppresses the
# step-by-step log
back_multiregression2 <- step(
  multiregression2,
  direction = "backward",
  trace = 0
)
summary(back_multiregression2)

## 
## Call:
## lm(formula = mpg ~ cyl + hp + wt + am, data = new_mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9387 -1.2560 -0.4013  1.1253  5.0513 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.70832    2.60489  12.940 7.73e-13 ***
## cyl6        -3.03134    1.40728  -2.154  0.04068 *  
## cyl8        -2.16368    2.28425  -0.947  0.35225    
## hp          -0.03211    0.01369  -2.345  0.02693 *  
## wt          -2.49683    0.88559  -2.819  0.00908 ** 
## am1          1.80921    1.39630   1.296  0.20646    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.41 on 26 degrees of freedom
## Multiple R-squared:  0.8659, Adjusted R-squared:  0.8401 
## F-statistic: 33.57 on 5 and 26 DF,  p-value: 1.506e-10

Interpretation :

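The intercept (33.71) is the predicted miles per gallon (mpg) for an automatic car with 4 cylinders when horsepower (hp) and weight (wt) are both zero; it serves as the baseline for the other coefficients rather than describing a realistic car.
Each effect below holds the other predictors in the model constant.
Moving from 4 to 6 cylinders decreases mpg by 3.031.
Moving from 4 to 8 cylinders decreases mpg by 2.16 (not statistically significant, p = 0.352).
A 1-unit increase in horsepower (hp) decreases mpg by 0.032.
A 1-unit increase in weight (wt, measured in 1,000 lbs) decreases mpg by 2.5.
Manual transmission increases mpg by 1.809 compared with automatic (not statistically significant, p = 0.206).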

# Diagnostic plots to check assumptions.
# par() returns the previous graphics settings; keep them and restore them
# after plotting so later plots in the same session are not forced into the
# 2x2 grid.
old_par <- par(mfrow = c(2, 2))  # 2x2 plot layout
plot(back_multiregression2)
par(old_par)

Here’s a description of the diagnostic plots :

Residuals vs Fitted (Top Left):

This plot checks the linearity assumption. Ideally, residuals should be randomly scattered around the horizontal line (y = 0).

Result : In this case, there seems to be a slight curve, suggesting potential non-linearity or an issue with model fit.

Normal Q-Q Plot (Top Right):

This plot checks if residuals follow a normal distribution. Points should fall along the diagonal line.

Result : Most points follow the line, but deviations in the tails suggest that the residuals may not be perfectly normally distributed.

Scale-Location (Bottom Left):

This plot tests for homoscedasticity (constant variance of residuals). The residuals should be evenly spread along the fitted values.

Result : The red line appears fairly flat, but some spread increases on the right side, indicating potential heteroscedasticity (non-constant variance).

Residuals vs Leverage (Bottom Right):

This plot identifies influential points and potential outliers. Points with high leverage and large residuals could disproportionately affect the model.

Result : A few points, like 17, 18, and 20, are highlighted near Cook’s distance lines, which could indicate influential data points.

Overall, the diagnostic plots suggest that while the model performs reasonably well, there may be concerns regarding non-linearity, potential outliers, and slight heteroscedasticity. Adjustments to the model or further investigation of influential points may be needed.

# Multicollinearity check: (generalized) variance inflation factors per term
car::vif(back_multiregression2)

##         GVIF Df GVIF^(1/(2*Df))
## cyl 5.824545  2        1.553515
## hp  4.703625  1        2.168784
## wt  4.007113  1        2.001778
## am  2.590777  1        1.609589

The Generalized Variance Inflation Factor (GVIF) was calculated for each model term to check for multicollinearity.

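GVIF: Generalized Variance Inflation Factor, an extension of the VIF to terms that have more than one coefficient (here the factor cyl). Df: Degrees of freedom (number of coefficients) for each term. GVIF^(1/(2*Df)): the GVIF rescaled by Df so that terms with different Df can be compared.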

Interpretation :

cyl: GVIF^(1/(2*Df)) = 1.553515
hp: GVIF^(1/(2*Df)) = 2.168784
wt: GVIF^(1/(2*Df)) = 2.001778
am: GVIF^(1/(2*Df)) = 1.609589

Generally, a GVIF^(1/(2*Df)) value greater than 2.5 or 3 indicates potential multicollinearity issues. In this case, none of the variables exceed this threshold, suggesting that multicollinearity is not a significant problem for the model.

# Conclusion (print output): heading for the terms kept in the final model
cat("\nBest variables selected for predicting mpg:\n")

## 
## Best variables selected for predicting mpg:

# Coefficient names of the reduced model (includes the intercept and one
# entry per non-reference factor level)
selected_terms <- names(coef(back_multiregression2))
print(selected_terms)

## [1] "(Intercept)" "cyl6"        "cyl8"        "hp"          "wt"         
## [6] "am1"

# Show the response together with the predictors kept in the final model
new_mtcars[, .(mpg, hp, wt, am, cyl)]

# Heading for the final model summary printed next
cat("\nModel Summary:\n")

## 
## Model Summary:

# Coefficient table and fit statistics of the reduced model
print(summary(back_multiregression2))

## 
## Call:
## lm(formula = mpg ~ cyl + hp + wt + am, data = new_mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9387 -1.2560 -0.4013  1.1253  5.0513 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.70832    2.60489  12.940 7.73e-13 ***
## cyl6        -3.03134    1.40728  -2.154  0.04068 *  
## cyl8        -2.16368    2.28425  -0.947  0.35225    
## hp          -0.03211    0.01369  -2.345  0.02693 *  
## wt          -2.49683    0.88559  -2.819  0.00908 ** 
## am1          1.80921    1.39630   1.296  0.20646    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.41 on 26 degrees of freedom
## Multiple R-squared:  0.8659, Adjusted R-squared:  0.8401 
## F-statistic: 33.57 on 5 and 26 DF,  p-value: 1.506e-10

Based on the linear regression model summary, here are the key conclusions:

Significant Predictors:

Cylinders (cyl6): Compared with 4-cylinder cars, 6-cylinder cars have significantly lower miles per gallon (mpg), by approximately 3.03 mpg (p = 0.041), holding horsepower, weight, and transmission constant.
Horsepower (hp): An increase in horsepower is associated with a slight but significant decrease in mpg.
Weight (wt): Heavier cars tend to have lower mpg, with weight being a significant predictor.

Non-Significant Predictors:

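Cylinders (cyl8): The difference in mpg between 8-cylinder and 4-cylinder cars is not statistically significant in this model (p = 0.352).
Transmission (am1): The difference in mpg between manual and automatic transmission is not statistically significant (p = 0.206) once cylinders, horsepower, and weight are accounted for.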

Model Fit:

The model explains a substantial portion of the variance in mpg, with an R-squared value of 0.8659. This means approximately 86.59% of the variability in mpg is explained by the model.
The adjusted R-squared value of 0.8401 indicates a good fit, accounting for the number of predictors in the model.

Overall Significance:

The model is statistically significant overall, with a p-value of 1.506e-10, indicating that the predictors collectively have a significant effect on mpg.

In summary, the model suggests that the number of cylinders (specifically 6 cylinders), horsepower, and weight are significant factors affecting a car’s fuel efficiency, while the type of transmission and having 8 cylinders are not significant predictors in this context. The model fits the data well and explains a large portion of the variability in fuel efficiency.