Question Number 2 :
library(data.table)
library(readr)
library(readxl)
library(ggplot2)
library(ggmosaic)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:ggmosaic':
##
## happy
library(corrplot)
## corrplot 0.94 loaded
library(car)
## Loading required package: carData
library(MASS)
# Echo the source of every code chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Directory holding the mtcars CSV export (machine-specific absolute path).
setdir <- "C:/Users/Risky's/Documents/R/R Notebook/mtcars/"

# setdir already ends with "/", so plain concatenation yields the full path.
mtcars_csv <- paste0(setdir, "mtcars.csv")
dt_mtcars <- fread(mtcars_csv)

# Show the table, then its column types.
dt_mtcars
str(dt_mtcars)
## Classes 'data.table' and 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : int 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : int 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : int 0 0 1 1 0 1 0 1 1 1 ...
## $ am : int 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: int 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: int 4 4 1 1 2 1 4 2 2 4 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Per-column summary statistics (min, quartiles, median, mean, max).
summary(dt_mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
# Table dimensions: rows, then columns.
dim(dt_mtcars)
## [1] 32 11
# Distribution of the response variable.
hist(dt_mtcars[, mpg])
# Number of missing values in each column.
na_per_column <- colSums(is.na(dt_mtcars))
print(na_per_column)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 0 0 0 0 0 0 0 0 0 0 0
# Pairwise correlations between mpg, disp and wt.
cor(dt_mtcars[, c("mpg", "disp", "wt")])
## mpg disp wt
## mpg 1.0000000 -0.8475514 -0.8676594
## disp -0.8475514 1.0000000 0.8879799
## wt -0.8676594 0.8879799 1.0000000
# Correlation matrix over every column. Passing the whole table avoids the
# hard-coded 1:11 index, which would fail if the column count ever changed.
cor(dt_mtcars)
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594
## cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958
## disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799
## hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479
## drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406
## wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000
## qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159
## vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157
## am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953
## gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870
## carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059
## qsec vs am gear carb
## mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507
## cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686
## hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247
## drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980
## wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594
## qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714
## am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435
## gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284
## carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000
# Shapiro-Wilk test of whether mpg is consistent with a normal distribution.
shapiro.test(dt_mtcars [, mpg])
##
## Shapiro-Wilk normality test
##
## data: dt_mtcars[, mpg]
## W = 0.94756, p-value = 0.1229
#MULTIPLE LINEAR REGRESSION
Create new table
# Work on a copy so the type conversions below leave dt_mtcars untouched.
new_mtcars <- copy(dt_mtcars)
Convert Data Type :
# Treat the discrete columns as categorical for the plots and models below.
factor_cols <- c("cyl", "gear", "carb", "am", "vs")
for (col_name in factor_cols) {
  set(new_mtcars, j = col_name, value = as.factor(new_mtcars[[col_name]]))
}
new_mtcars
pairs(new_mtcars)
Transmission vs MPG :
# Violin plot of fuel economy split by transmission type.
ggplot(new_mtcars, aes(x = am, y = mpg)) +
  geom_violin() +
  labs(x = "Transmission (0=Automatic, 1=Manual)", y = "Miles/gallon") +
  theme_grey()
# mpg against weight, coloured by engine shape, with a smoothed trend.
ggplot(new_mtcars, aes(x = wt, y = mpg, color = vs)) +
  geom_smooth() +
  geom_point() +
  labs(
    x = "Weight",
    y = "Miles/gallon",
    color = "Engine (0=VShape,1=Straight)"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Create the model :
# Full model: regress mpg on every other column of new_mtcars.
multiregression2 <- lm(mpg ~ ., data = new_mtcars)
# Residual summary, coefficient table and fit statistics for the full model.
summary(multiregression2)
##
## Call:
## lm(formula = mpg ~ ., data = new_mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5087 -1.3584 -0.0948 0.7745 4.6251
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.87913 20.06582 1.190 0.2525
## cyl6 -2.64870 3.04089 -0.871 0.3975
## cyl8 -0.33616 7.15954 -0.047 0.9632
## disp 0.03555 0.03190 1.114 0.2827
## hp -0.07051 0.03943 -1.788 0.0939 .
## drat 1.18283 2.48348 0.476 0.6407
## wt -4.52978 2.53875 -1.784 0.0946 .
## qsec 0.36784 0.93540 0.393 0.6997
## vs1 1.93085 2.87126 0.672 0.5115
## am1 1.21212 3.21355 0.377 0.7113
## gear4 1.11435 3.79952 0.293 0.7733
## gear5 2.52840 3.73636 0.677 0.5089
## carb2 -0.97935 2.31797 -0.423 0.6787
## carb3 2.99964 4.29355 0.699 0.4955
## carb4 1.09142 4.44962 0.245 0.8096
## carb6 4.47757 6.38406 0.701 0.4938
## carb8 7.25041 8.36057 0.867 0.3995
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.833 on 15 degrees of freedom
## Multiple R-squared: 0.8931, Adjusted R-squared: 0.779
## F-statistic: 7.83 on 16 and 15 DF, p-value: 0.000124
Feature Selection (backward stepwise elimination) :
# Backward elimination starting from the full model; trace = FALSE hides the
# per-step log. step() compares candidate models by AIC by default.
back_multiregression2 <- step(multiregression2, direction="backward", trace=FALSE)
summary(back_multiregression2)
##
## Call:
## lm(formula = mpg ~ cyl + hp + wt + am, data = new_mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9387 -1.2560 -0.4013 1.1253 5.0513
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.70832 2.60489 12.940 7.73e-13 ***
## cyl6 -3.03134 1.40728 -2.154 0.04068 *
## cyl8 -2.16368 2.28425 -0.947 0.35225
## hp -0.03211 0.01369 -2.345 0.02693 *
## wt -2.49683 0.88559 -2.819 0.00908 **
## am1 1.80921 1.39630 1.296 0.20646
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.41 on 26 degrees of freedom
## Multiple R-squared: 0.8659, Adjusted R-squared: 0.8401
## F-statistic: 33.57 on 5 and 26 DF, p-value: 1.506e-10
Interpretation :
# Diagnostic plots to check assumptions.
# par() returns the previous settings; keep them so the 2x2 layout does not
# leak into plots drawn later in the session.
old_par <- par(mfrow = c(2, 2)) # 2x2 plot layout
plot(back_multiregression2)
par(old_par)
Here’s a description of the diagnostic plots :
Residuals vs Fitted: This plot checks the linearity assumption. Ideally, residuals should be randomly scattered around the horizontal line (y = 0).
Result : In this case, there seems to be a slight curve, suggesting potential non-linearity or an issue with model fit.
Normal Q-Q: This plot checks if residuals follow a normal distribution. Points should fall along the diagonal line.
Result : Most points follow the line, but deviations in the tails suggest that the residuals may not be perfectly normally distributed.
Scale-Location: This plot tests for homoscedasticity (constant variance of residuals). The residuals should be evenly spread along the fitted values.
Result : The red line appears fairly flat, but some spread increases on the right side, indicating potential heteroscedasticity (non-constant variance).
Residuals vs Leverage: This plot identifies influential points and potential outliers. Points with high leverage and large residuals could disproportionately affect the model.
Result : A few points, like 17, 18, and 20, are highlighted near Cook’s distance lines, which could indicate influential data points.
Overall, the diagnostic plots suggest that while the model performs reasonably well, there may be concerns regarding non-linearity, potential outliers, and slight heteroscedasticity. Adjustments to the model or further investigation of influential points may be needed.
# Check for multicollinearity using VIF (Variance Inflation Factor).
# vif() is provided by the car package loaded at the top of the file.
vif(back_multiregression2)
## GVIF Df GVIF^(1/(2*Df))
## cyl 5.824545 2 1.553515
## hp 4.703625 1 2.168784
## wt 4.007113 1 2.001778
## am 2.590777 1 1.609589
We calculated the Generalized Variance Inflation Factor (GVIF) for each predictor to check for multicollinearity.
GVIF: Generalized Variance Inflation Factor, which adjusts for the degrees of freedom (Df). Df: Degrees of freedom for each variable.
Interpretation :
As a rule of thumb, a GVIF^(1/(2*Df)) value greater than 2.5 or 3 indicates potential multicollinearity issues. In this case, the largest value is about 2.17 (hp), so none of the variables exceed this threshold, suggesting that multicollinearity is not a significant problem for the model.
# Conclusion (print output)
cat("\nBest variables selected for predicting mpg:\n")
##
## Best variables selected for predicting mpg:
# Coefficient names of the reduced model (factor levels appear as dummies).
selected_terms <- names(coef(back_multiregression2))
print(selected_terms)
## [1] "(Intercept)" "cyl6" "cyl8" "hp" "wt"
## [6] "am1"
# Response plus the retained predictors, as stored in the table.
new_mtcars[, .(mpg, hp, wt, am, cyl)]
cat("\nModel Summary:\n")
##
## Model Summary:
final_summary <- summary(back_multiregression2)
print(final_summary)
##
## Call:
## lm(formula = mpg ~ cyl + hp + wt + am, data = new_mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9387 -1.2560 -0.4013 1.1253 5.0513
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.70832 2.60489 12.940 7.73e-13 ***
## cyl6 -3.03134 1.40728 -2.154 0.04068 *
## cyl8 -2.16368 2.28425 -0.947 0.35225
## hp -0.03211 0.01369 -2.345 0.02693 *
## wt -2.49683 0.88559 -2.819 0.00908 **
## am1 1.80921 1.39630 1.296 0.20646
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.41 on 26 degrees of freedom
## Multiple R-squared: 0.8659, Adjusted R-squared: 0.8401
## F-statistic: 33.57 on 5 and 26 DF, p-value: 1.506e-10
Based on the linear regression model summary, here are the key conclusions:
In summary, at the 5% significance level the model suggests that the number of cylinders (specifically 6 cylinders, relative to the 4-cylinder baseline), horsepower, and weight are significant factors affecting a car’s fuel efficiency, while the type of transmission and having 8 cylinders are not significant predictors in this context. The model fits the data well, explaining about 84% of the variability in fuel efficiency (adjusted R-squared = 0.8401).