The Dtaset

The data was extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles (1973–74 models).
We will explore two questions 1- which is better for mpg, manual or automatic transmition using hypothesis t-test? 2- What is the best regression model for this dataset?

 data("mtcars")

Exploratory Data Analysis

#summary of the data
library(skimr)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

skim(mtcars)

Data summary
Name	mtcars
Number of rows	32
Number of columns	11
_______________________
Column type frequency:
numeric	11
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
mpg	1	20.09	6.03	10.40	15.43	19.20	22.80	33.90	▃▇▅▁▂
cyl	1	6.19	1.79	4.00	4.00	6.00	8.00	8.00	▆▁▃▁▇
disp	1	230.72	123.94	71.10	120.83	196.30	326.00	472.00	▇▃▃▃▂
hp	1	146.69	68.56	52.00	96.50	123.00	180.00	335.00	▇▇▆▃▁
drat	1	3.60	0.53	2.76	3.08	3.70	3.92	4.93	▇▃▇▅▁
wt	1	3.22	0.98	1.51	2.58	3.33	3.61	5.42	▃▃▇▁▂
qsec	1	17.85	1.79	14.50	16.89	17.71	18.90	22.90	▃▇▇▂▁
vs	1	0.44	0.50	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▆
am	1	0.41	0.50	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▆
gear	1	3.69	0.74	3.00	3.00	4.00	4.00	5.00	▇▁▆▁▂
carb	1	2.81	1.62	1.00	2.00	2.00	4.00	8.00	▇▂▅▁▁

str(mtcars)

## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

No missing values observed in this dataset

# Visualizing the histogram of the dependent variable(mpg)
# Histogram with density plot
library(ggplot2)

p1 <- ggplot(mtcars, aes(x= mpg)) + 
       geom_histogram(aes(y=..density..), colour="black", fill="white")+
      geom_density(alpha=.2, fill="#69b3a2") 
p1

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The histogram is skewed to the left for the mpg variable so transformation can enhance the distribution.

#visualizing the log of mpg
mtcars$log_mpg = log(mtcars$mpg)
p <- ggplot(mtcars, aes(x= log_mpg)) + 
       geom_histogram(aes(y=..density..), colour="black", fill="white")+
      geom_density(alpha=.2, fill="#69b3a2") 
p

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

It is now normally distributed with log_mpg

# Visualizing the histogram and density plot of the independent variables
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

p1 <- ggplot(mtcars, aes(x=cyl)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#69b3a2") 
p2 <- ggplot(mtcars, aes(x=disp)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#69b3a2") 
p3 <- ggplot(mtcars, aes(x=hp)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#69b3a2") 
p4 <- ggplot(mtcars, aes(x=wt)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#69b3a2") 
p5 <- ggplot(mtcars, aes(x= drat)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#69b3a2") 
p6 <- ggplot(mtcars, aes(x= qsec)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#69b3a2") 

grid.arrange(p1,p2,p3,p4,p5,p6, ncol=3)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#transforming the variable hp to log_hp 
mtcars$log_hp = log(mtcars$hp)
p <- ggplot(mtcars, aes(x= log_hp)) + 
       geom_histogram(aes(y=..density..), colour="black", fill="white")+
      geom_density(alpha=.2, fill="#69b3a2") 
p

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library(gridExtra)
plot1 <- ggplot(data = mtcars, aes(x=wt, y=mpg)) +
geom_point(aes(color = factor(am))) +
labs(x = "wt")
plot2 <- ggplot(data = mtcars, aes(x=hp, y=mpg)) +
geom_point(aes(color = factor(am))) +
labs(x = "hp")

grid.arrange(plot1, plot2, ncol = 2)

It is shown that there is no interaction between am(automatic, manual) and hp/ wt

#scatterplot between the variables to observe the correlation
library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(corrplot)

## corrplot 0.92 loaded

mtcars1 <- mtcars[, c(1:6)] 
ggpairs(mtcars1)+theme_bw()

#correlation plot with numbers
library(corrplot)
library("Hmisc")

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

cor1 <- rcorr(as.matrix(mtcars1))
cor1

##        mpg   cyl  disp    hp  drat    wt
## mpg   1.00 -0.85 -0.85 -0.78  0.68 -0.87
## cyl  -0.85  1.00  0.90  0.83 -0.70  0.78
## disp -0.85  0.90  1.00  0.79 -0.71  0.89
## hp   -0.78  0.83  0.79  1.00 -0.45  0.66
## drat  0.68 -0.70 -0.71 -0.45  1.00 -0.71
## wt   -0.87  0.78  0.89  0.66 -0.71  1.00
## 
## n= 32 
## 
## 
## P
##      mpg  cyl  disp hp   drat wt  
## mpg       0.00 0.00 0.00 0.00 0.00
## cyl  0.00      0.00 0.00 0.00 0.00
## disp 0.00 0.00      0.00 0.00 0.00
## hp   0.00 0.00 0.00      0.01 0.00
## drat 0.00 0.00 0.00 0.01      0.00
## wt   0.00 0.00 0.00 0.00 0.00

ggcorr(mtcars1, label = TRUE, nbreaks = 4, palette = "RdGy",label_color = "white")+ theme_bw()

Hypothesis Testing

First we have to transform “am” into a factor [Transmission (0 = automatic, 1 = manual)]

mtcars$trans= factor(mtcars$am, levels=c(0,1), labels=c("automatic","manual"))
head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb  log_mpg
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4 3.044522
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4 3.044522
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1 3.126761
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1 3.063391
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2 2.928524
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1 2.895912
##                     log_hp     trans
## Mazda RX4         4.700480    manual
## Mazda RX4 Wag     4.700480    manual
## Datsun 710        4.532599    manual
## Hornet 4 Drive    4.700480 automatic
## Hornet Sportabout 5.164786 automatic
## Valiant           4.653960 automatic

summary(mtcars$trans)

## automatic    manual 
##        19        13

#visualizing the manual and automatic transition 
library(hrbrthemes )

## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.

##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and

##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow

b <- mtcars %>%
  
  ggplot( aes(x=mtcars$trans, y=mpg) ) +
    geom_boxplot(fill="#69b3a2") +
    theme_ipsum() +
      geom_jitter(position=position_jitter(width=.1, height=0))+
    xlab("am")
b

## Warning: Use of `mtcars$trans` is discouraged. Use `trans` instead.
## Use of `mtcars$trans` is discouraged. Use `trans` instead.

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

From the boxplot it is shown that there is difference in fuel consumptions between automatic and manual autombiles. Before setting the hypothesis test we have to check the the two ‘independent populations’ are normally distributed

Checking the normality:

1.Normal QQ-plot

#Normal QQ-plot
par(mfrow=c(1,2))
with(mtcars,qqnorm(mpg[trans=="manual"]));with(mtcars,qqline(mpg[trans=="manual"]))
with(mtcars,qqnorm(mpg[trans=="automatic"])); with(mtcars, qqline(mpg[trans=="automatic"]))

ggplot(mtcars, aes(sample=mpg, color=trans))+stat_qq()+theme_bw()

Shapiro-Wilk normality test

with(mtcars, shapiro.test(mpg[trans == "manual"]))

## 
##  Shapiro-Wilk normality test
## 
## data:  mpg[trans == "manual"]
## W = 0.9458, p-value = 0.5363

with(mtcars, shapiro.test(mpg[trans == "automatic"]))

## 
##  Shapiro-Wilk normality test
## 
## data:  mpg[trans == "automatic"]
## W = 0.97677, p-value = 0.8987

From the graph and the test we can conclude that they are normally distributed. ### Checking the variance

with(mtcars,var(mpg[trans=="manual"]))

## [1] 38.02577

with(mtcars,var(mpg[trans=="automatic"]))

## [1] 14.6993

The variance is different between the two populations

Setting the hypothesis testing:

The null hypothesis H0: mu1 = mu2 , There is no difference in fuel consumption between automatic and manual automobiles
The alternative hypothesis H1: mu1 != mu2 , There is difference in fuel consumption between manual and automatic autombiles.

with(mtcars,t.test(mpg~trans,mu=0, alt="two.sided",conf=0.95,var.eq=F,paired=F))

## 
##  Welch Two Sample t-test
## 
## data:  mpg by trans
## t = -3.7671, df = 18.332, p-value = 0.001374
## alternative hypothesis: true difference in means between group automatic and group manual is not equal to 0
## 95 percent confidence interval:
##  -11.280194  -3.209684
## sample estimates:
## mean in group automatic    mean in group manual 
##                17.14737                24.39231

Conclusion

The p-value is < 0.05, We can conclude that there is difference in fuel consumption between manual and automatic autombiles

Linear Regression Model:

mtcars_1 <- mtcars[,-c(12:15)]
head(mtcars_1)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

lm <- lm(mpg~., mtcars_1)
summary(lm)

## 
## Call:
## lm(formula = mpg ~ ., data = mtcars_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07

The p-values of all the variables in this model are not significant because of multidisciplinary between the variables and some variables such as gear and carbs look unimportant in this model. We will use stepwise AIC selection model to get the best fitting model.The lower the AIC the better.

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

step = stepAIC(lm(mpg~.,data = mtcars_1))

## Start:  AIC=70.9
## mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb
## 
##        Df Sum of Sq    RSS    AIC
## - cyl   1    0.0799 147.57 68.915
## - vs    1    0.1601 147.66 68.932
## - carb  1    0.4067 147.90 68.986
## - gear  1    1.3531 148.85 69.190
## - drat  1    1.6270 149.12 69.249
## - disp  1    3.9167 151.41 69.736
## - hp    1    6.8399 154.33 70.348
## - qsec  1    8.8641 156.36 70.765
## <none>              147.49 70.898
## - am    1   10.5467 158.04 71.108
## - wt    1   27.0144 174.51 74.280
## 
## Step:  AIC=68.92
## mpg ~ disp + hp + drat + wt + qsec + vs + am + gear + carb
## 
##        Df Sum of Sq    RSS    AIC
## - vs    1    0.2685 147.84 66.973
## - carb  1    0.5201 148.09 67.028
## - gear  1    1.8211 149.40 67.308
## - drat  1    1.9826 149.56 67.342
## - disp  1    3.9009 151.47 67.750
## - hp    1    7.3632 154.94 68.473
## <none>              147.57 68.915
## - qsec  1   10.0933 157.67 69.032
## - am    1   11.8359 159.41 69.384
## - wt    1   27.0280 174.60 72.297
## 
## Step:  AIC=66.97
## mpg ~ disp + hp + drat + wt + qsec + am + gear + carb
## 
##        Df Sum of Sq    RSS    AIC
## - carb  1    0.6855 148.53 65.121
## - gear  1    2.1437 149.99 65.434
## - drat  1    2.2139 150.06 65.449
## - disp  1    3.6467 151.49 65.753
## - hp    1    7.1060 154.95 66.475
## <none>              147.84 66.973
## - am    1   11.5694 159.41 67.384
## - qsec  1   15.6830 163.53 68.200
## - wt    1   27.3799 175.22 70.410
## 
## Step:  AIC=65.12
## mpg ~ disp + hp + drat + wt + qsec + am + gear
## 
##        Df Sum of Sq    RSS    AIC
## - gear  1     1.565 150.09 63.457
## - drat  1     1.932 150.46 63.535
## <none>              148.53 65.121
## - disp  1    10.110 158.64 65.229
## - am    1    12.323 160.85 65.672
## - hp    1    14.826 163.35 66.166
## - qsec  1    26.408 174.94 68.358
## - wt    1    69.127 217.66 75.350
## 
## Step:  AIC=63.46
## mpg ~ disp + hp + drat + wt + qsec + am
## 
##        Df Sum of Sq    RSS    AIC
## - drat  1     3.345 153.44 62.162
## - disp  1     8.545 158.64 63.229
## <none>              150.09 63.457
## - hp    1    13.285 163.38 64.171
## - am    1    20.036 170.13 65.466
## - qsec  1    25.574 175.67 66.491
## - wt    1    67.572 217.66 73.351
## 
## Step:  AIC=62.16
## mpg ~ disp + hp + wt + qsec + am
## 
##        Df Sum of Sq    RSS    AIC
## - disp  1     6.629 160.07 61.515
## <none>              153.44 62.162
## - hp    1    12.572 166.01 62.682
## - qsec  1    26.470 179.91 65.255
## - am    1    32.198 185.63 66.258
## - wt    1    69.043 222.48 72.051
## 
## Step:  AIC=61.52
## mpg ~ hp + wt + qsec + am
## 
##        Df Sum of Sq    RSS    AIC
## - hp    1     9.219 169.29 61.307
## <none>              160.07 61.515
## - qsec  1    20.225 180.29 63.323
## - am    1    25.993 186.06 64.331
## - wt    1    78.494 238.56 72.284
## 
## Step:  AIC=61.31
## mpg ~ wt + qsec + am
## 
##        Df Sum of Sq    RSS    AIC
## <none>              169.29 61.307
## - am    1    26.178 195.46 63.908
## - qsec  1   109.034 278.32 75.217
## - wt    1   183.347 352.63 82.790

step

## 
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars_1)
## 
## Coefficients:
## (Intercept)           wt         qsec           am  
##       9.618       -3.917        1.226        2.936

The stepwise regression selection suggests mpg ~ wt + qsec + am

lm_model_1 <- lm(mpg ~ wt + qsec + am, mtcars_1)
summary(lm_model_1)

## 
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4811 -1.5555 -0.7257  1.4110  4.6610 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.6178     6.9596   1.382 0.177915    
## wt           -3.9165     0.7112  -5.507 6.95e-06 ***
## qsec          1.2259     0.2887   4.247 0.000216 ***
## am            2.9358     1.4109   2.081 0.046716 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared:  0.8497, Adjusted R-squared:  0.8336 
## F-statistic: 52.75 on 3 and 28 DF,  p-value: 1.21e-11

Assessment of the linear model assumptions:

par(mfrow=c(2,2))
plot(lm_model_1)

library(gvlma)
 gvlma::gvlma(x = lm_model_1)

## 
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars_1)
## 
## Coefficients:
## (Intercept)           wt         qsec           am  
##       9.618       -3.917        1.226        2.936  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma::gvlma(x = lm_model_1) 
## 
##                      Value  p-value                   Decision
## Global Stat        11.3771 0.022637 Assumptions NOT satisfied!
## Skewness            1.5545 0.212473    Assumptions acceptable.
## Kurtosis            0.6585 0.417077    Assumptions acceptable.
## Link Function       8.9061 0.002842 Assumptions NOT satisfied!
## Heteroscedasticity  0.2580 0.611526    Assumptions acceptable.

Not all of the assumptions are met in this model.

stepback selection

stepback     <- stepAIC(lm(mpg~.,data = mtcars_1), direction = "backward")

## Start:  AIC=70.9
## mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb
## 
##        Df Sum of Sq    RSS    AIC
## - cyl   1    0.0799 147.57 68.915
## - vs    1    0.1601 147.66 68.932
## - carb  1    0.4067 147.90 68.986
## - gear  1    1.3531 148.85 69.190
## - drat  1    1.6270 149.12 69.249
## - disp  1    3.9167 151.41 69.736
## - hp    1    6.8399 154.33 70.348
## - qsec  1    8.8641 156.36 70.765
## <none>              147.49 70.898
## - am    1   10.5467 158.04 71.108
## - wt    1   27.0144 174.51 74.280
## 
## Step:  AIC=68.92
## mpg ~ disp + hp + drat + wt + qsec + vs + am + gear + carb
## 
##        Df Sum of Sq    RSS    AIC
## - vs    1    0.2685 147.84 66.973
## - carb  1    0.5201 148.09 67.028
## - gear  1    1.8211 149.40 67.308
## - drat  1    1.9826 149.56 67.342
## - disp  1    3.9009 151.47 67.750
## - hp    1    7.3632 154.94 68.473
## <none>              147.57 68.915
## - qsec  1   10.0933 157.67 69.032
## - am    1   11.8359 159.41 69.384
## - wt    1   27.0280 174.60 72.297
## 
## Step:  AIC=66.97
## mpg ~ disp + hp + drat + wt + qsec + am + gear + carb
## 
##        Df Sum of Sq    RSS    AIC
## - carb  1    0.6855 148.53 65.121
## - gear  1    2.1437 149.99 65.434
## - drat  1    2.2139 150.06 65.449
## - disp  1    3.6467 151.49 65.753
## - hp    1    7.1060 154.95 66.475
## <none>              147.84 66.973
## - am    1   11.5694 159.41 67.384
## - qsec  1   15.6830 163.53 68.200
## - wt    1   27.3799 175.22 70.410
## 
## Step:  AIC=65.12
## mpg ~ disp + hp + drat + wt + qsec + am + gear
## 
##        Df Sum of Sq    RSS    AIC
## - gear  1     1.565 150.09 63.457
## - drat  1     1.932 150.46 63.535
## <none>              148.53 65.121
## - disp  1    10.110 158.64 65.229
## - am    1    12.323 160.85 65.672
## - hp    1    14.826 163.35 66.166
## - qsec  1    26.408 174.94 68.358
## - wt    1    69.127 217.66 75.350
## 
## Step:  AIC=63.46
## mpg ~ disp + hp + drat + wt + qsec + am
## 
##        Df Sum of Sq    RSS    AIC
## - drat  1     3.345 153.44 62.162
## - disp  1     8.545 158.64 63.229
## <none>              150.09 63.457
## - hp    1    13.285 163.38 64.171
## - am    1    20.036 170.13 65.466
## - qsec  1    25.574 175.67 66.491
## - wt    1    67.572 217.66 73.351
## 
## Step:  AIC=62.16
## mpg ~ disp + hp + wt + qsec + am
## 
##        Df Sum of Sq    RSS    AIC
## - disp  1     6.629 160.07 61.515
## <none>              153.44 62.162
## - hp    1    12.572 166.01 62.682
## - qsec  1    26.470 179.91 65.255
## - am    1    32.198 185.63 66.258
## - wt    1    69.043 222.48 72.051
## 
## Step:  AIC=61.52
## mpg ~ hp + wt + qsec + am
## 
##        Df Sum of Sq    RSS    AIC
## - hp    1     9.219 169.29 61.307
## <none>              160.07 61.515
## - qsec  1    20.225 180.29 63.323
## - am    1    25.993 186.06 64.331
## - wt    1    78.494 238.56 72.284
## 
## Step:  AIC=61.31
## mpg ~ wt + qsec + am
## 
##        Df Sum of Sq    RSS    AIC
## <none>              169.29 61.307
## - am    1    26.178 195.46 63.908
## - qsec  1   109.034 278.32 75.217
## - wt    1   183.347 352.63 82.790

stepback

## 
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars_1)
## 
## Coefficients:
## (Intercept)           wt         qsec           am  
##       9.618       -3.917        1.226        2.936

It is observed that stepwise and stepback suggest the same model lm(formula = mpg ~ wt + qsec + am, data = mtcars_1)

#Building a correlation matrix
cor(as.matrix(mtcars[,c(1,7,9)]))

##            mpg       qsec         am
## mpg  1.0000000  0.4186840  0.5998324
## qsec 0.4186840  1.0000000 -0.2298609
## am   0.5998324 -0.2298609  1.0000000

Model 2

I will try to use log-mpg and log-hp as they are normally distributed, visualized in the EDA above.

#removing mpg, hp and transmittion 
mtcars_2 <- mtcars[,-c(1,4,14,15)]
head(mtcars_2)

##                   cyl disp drat    wt  qsec vs am gear carb  log_mpg   log_hp
## Mazda RX4           6  160 3.90 2.620 16.46  0  1    4    4 3.044522 4.700480
## Mazda RX4 Wag       6  160 3.90 2.875 17.02  0  1    4    4 3.044522 4.700480
## Datsun 710          4  108 3.85 2.320 18.61  1  1    4    1 3.126761 4.532599
## Hornet 4 Drive      6  258 3.08 3.215 19.44  1  0    3    1 3.063391 4.700480
## Hornet Sportabout   8  360 3.15 3.440 17.02  0  0    3    2 2.928524 5.164786
## Valiant             6  225 2.76 3.460 20.22  1  0    3    1 2.895912 4.653960

step2 = stepAIC(lm(log_mpg~.,data = mtcars_2))

## Start:  AIC=-129.33
## log_mpg ~ cyl + disp + drat + wt + qsec + vs + am + gear + carb + 
##     log_hp
## 
##          Df Sum of Sq     RSS     AIC
## - disp    1  0.000007 0.28268 -131.33
## - drat    1  0.000026 0.28270 -131.33
## - vs      1  0.000086 0.28276 -131.32
## - am      1  0.001172 0.28385 -131.20
## - cyl     1  0.001331 0.28401 -131.18
## - carb    1  0.005141 0.28781 -130.76
## - qsec    1  0.005775 0.28845 -130.69
## - gear    1  0.016870 0.29954 -129.48
## <none>                0.28267 -129.33
## - log_hp  1  0.032853 0.31553 -127.82
## - wt      1  0.052399 0.33507 -125.89
## 
## Step:  AIC=-131.33
## log_mpg ~ cyl + drat + wt + qsec + vs + am + gear + carb + log_hp
## 
##          Df Sum of Sq     RSS     AIC
## - drat    1  0.000022 0.28270 -133.33
## - vs      1  0.000086 0.28277 -133.32
## - am      1  0.001180 0.28386 -133.20
## - cyl     1  0.001573 0.28425 -133.16
## - qsec    1  0.006194 0.28887 -132.64
## - carb    1  0.008379 0.29106 -132.40
## - gear    1  0.017012 0.29969 -131.46
## <none>                0.28268 -131.33
## - log_hp  1  0.036280 0.31896 -129.47
## - wt      1  0.117956 0.40064 -122.17
## 
## Step:  AIC=-133.33
## log_mpg ~ cyl + wt + qsec + vs + am + gear + carb + log_hp
## 
##          Df Sum of Sq     RSS     AIC
## - vs      1  0.000090 0.28279 -135.32
## - am      1  0.001158 0.28386 -135.20
## - cyl     1  0.001700 0.28440 -135.14
## - qsec    1  0.006443 0.28915 -134.61
## - carb    1  0.008966 0.29167 -134.33
## - gear    1  0.017148 0.29985 -133.45
## <none>                0.28270 -133.33
## - log_hp  1  0.037932 0.32064 -131.30
## - wt      1  0.118148 0.40085 -124.16
## 
## Step:  AIC=-135.32
## log_mpg ~ cyl + wt + qsec + am + gear + carb + log_hp
## 
##          Df Sum of Sq     RSS     AIC
## - am      1  0.001321 0.28411 -137.17
## - cyl     1  0.002227 0.28502 -137.07
## - qsec    1  0.006888 0.28968 -136.55
## - carb    1  0.009003 0.29180 -136.32
## - gear    1  0.017068 0.29986 -135.45
## <none>                0.28279 -135.32
## - log_hp  1  0.039235 0.32203 -133.16
## - wt      1  0.123145 0.40594 -125.75
## 
## Step:  AIC=-137.17
## log_mpg ~ cyl + wt + qsec + gear + carb + log_hp
## 
##          Df Sum of Sq     RSS     AIC
## - cyl     1  0.001637 0.28575 -138.99
## - qsec    1  0.005572 0.28969 -138.55
## - carb    1  0.009031 0.29315 -138.17
## <none>                0.28411 -137.17
## - gear    1  0.023660 0.30777 -136.61
## - log_hp  1  0.043146 0.32726 -134.65
## - wt      1  0.127090 0.41120 -127.34
## 
## Step:  AIC=-138.99
## log_mpg ~ wt + qsec + gear + carb + log_hp
## 
##          Df Sum of Sq     RSS     AIC
## - qsec    1  0.003935 0.28969 -140.55
## - carb    1  0.008500 0.29425 -140.05
## <none>                0.28575 -138.99
## - gear    1  0.025311 0.31106 -138.27
## - log_hp  1  0.044851 0.33060 -136.32
## - wt      1  0.130582 0.41633 -128.94
## 
## Step:  AIC=-140.55
## log_mpg ~ wt + gear + carb + log_hp
## 
##          Df Sum of Sq     RSS     AIC
## - carb    1  0.012582 0.30227 -141.19
## <none>                0.28969 -140.55
## - gear    1  0.025830 0.31552 -139.82
## - log_hp  1  0.120610 0.41030 -131.41
## - wt      1  0.145644 0.43533 -129.52
## 
## Step:  AIC=-141.19
## log_mpg ~ wt + gear + log_hp
## 
##          Df Sum of Sq     RSS     AIC
## - gear    1  0.013273 0.31554 -141.81
## <none>                0.30227 -141.19
## - wt      1  0.211739 0.51401 -126.20
## - log_hp  1  0.250113 0.55238 -123.90
## 
## Step:  AIC=-141.81
## log_mpg ~ wt + log_hp
## 
##          Df Sum of Sq     RSS     AIC
## <none>                0.31554 -141.81
## - log_hp  1   0.24092 0.55646 -125.66
## - wt      1   0.46586 0.78140 -114.80

step2

## 
## Call:
## lm(formula = log_mpg ~ wt + log_hp, data = mtcars_2)
## 
## Coefficients:
## (Intercept)           wt       log_hp  
##      4.8317      -0.1794      -0.2657

Based on the result this model is more significant than the last model mpg+qsec+am ### Assessment model_2 assumptions:

lm_model_2 <- lm(formula = log_mpg ~ wt + log_hp, data = mtcars_2)
summary(lm_model_2)

## 
## Call:
## lm(formula = log_mpg ~ wt + log_hp, data = mtcars_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16296 -0.07799 -0.02210  0.06837  0.25985 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.83167    0.22198  21.766  < 2e-16 ***
## wt          -0.17942    0.02742  -6.543 3.63e-07 ***
## log_hp      -0.26566    0.05646  -4.706 5.75e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1043 on 29 degrees of freedom
## Multiple R-squared:  0.8852, Adjusted R-squared:  0.8773 
## F-statistic: 111.8 on 2 and 29 DF,  p-value: 2.338e-14

par(mfrow=c(2,2))
plot(lm_model_2)

library(gvlma)
 gvlma::gvlma(x = lm_model_2)

## 
## Call:
## lm(formula = log_mpg ~ wt + log_hp, data = mtcars_2)
## 
## Coefficients:
## (Intercept)           wt       log_hp  
##      4.8317      -0.1794      -0.2657  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma::gvlma(x = lm_model_2) 
## 
##                      Value p-value                Decision
## Global Stat        3.11370  0.5390 Assumptions acceptable.
## Skewness           1.93334  0.1644 Assumptions acceptable.
## Kurtosis           0.06221  0.8030 Assumptions acceptable.
## Link Function      0.62940  0.4276 Assumptions acceptable.
## Heteroscedasticity 0.48875  0.4845 Assumptions acceptable.

All the assumptions are met in this model lm(formula = log_mpg ~ wt + log_hp, data = mtcars_2)

#Building a correlation matrix
cor(as.matrix(mtcars_2[,c(1,6,11)]))

##               cyl         vs     log_hp
## cyl     1.0000000 -0.8108118  0.8806646
## vs     -0.8108118  1.0000000 -0.7617319
## log_hp  0.8806646 -0.7617319  1.0000000

In conclusion:

Based on the R-squared, model assumptions and p-value, we can conclude that model 2 “lm(formula = log_mpg ~ wt + log_hp, data = mtcars_2)” is better than model 1.

Hypothesis Testing and Linear regression

Esraa Abdelhamed

2022-04-23

The Dtaset

Exploratory Data Analysis

Hypothesis Testing

First we have to transform “am” into a factor [Transmission (0 = automatic, 1 = manual)]

Checking the normality:

Setting the hypothesis testing:

Conclusion

Linear Regression Model:

Assessment of the linear model assumptions:

stepback selection

Model 2

In conclusion: