Regression in machine learning

0.1 Assumption 2

# Regress stopping distance on speed, then average the residuals of the fit.
mod <- lm(dist ~ speed, data = cars)
mean(residuals(mod))

## [1] 8.65974e-17

Since the mean of residuals is approximately zero, this assumption holds true for this model.

0.2 Assumption 3

# Diagnostic plots for two linear fits, each drawn as a 2 x 2 panel of plots.
# par() returns the previous settings; they are saved here and restored at the
# end so the layout does not leak into plots drawn later in the script.
old_par <- par(mfrow = c(2, 2))  # 2 rows x 2 columns plot layout
mod_1 <- lm(mpg ~ disp, data = mtcars)  # linear model on mtcars
plot(mod_1)

mod <- lm(dist ~ speed, data = cars[1:20, ])  # linear model on the first 20 rows of cars
plot(mod)
par(old_par)  # restore the layout that was active before this block

0.3 Assumption 4

# Method 1: look at the autocorrelation function of the residuals
library(ggplot2)
data("economics")
lmMod <- lm(pce ~ pop, data = economics)
acf(lmMod$residuals)  # the plot shows strongly autocorrelated residuals

# Method 2: runs test for randomness of the residual sequence
library(lawstat)
runs.test(lmMod$residuals)

## 
##  Runs Test - Two sided
## 
## data:  lmMod$residuals
## Standardized Runs Statistic = -23.812, p-value < 2.2e-16

#=>   Runs Test - Two sided

# Method 3: Durbin-Watson test
# Applied to the fitted model object lmMod (not to its residual vector).
lmtest::dwtest(lmMod)

## 
##  Durbin-Watson test
## 
## data:  lmMod
## DW = 0.002159, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

#=>   Durbin-Watson test

# Attach the residuals of lmMod to the data as resid_mod1, then add a column
# lag1 that holds resid_mod1 shifted back by one row (slideBy = -1).
library(DataCombine)
econ_data <- data.frame(economics, resid_mod1 = lmMod$residuals)
econ_data_1 <- slide(
  econ_data,
  Var = "resid_mod1",
  NewVar = "lag1",
  slideBy = -1
)

## 
## Remember to put econ_data in time order before running.

## 
## Lagging resid_mod1 by 1 time units.

# Drop rows containing NA, then refit with the lagged residual as an
# additional predictor.
econ_data_2 <- na.omit(econ_data_1)
lmMod2 <- lm(pce ~ pop + lag1, data = econ_data_2)

# Repeat the autocorrelation checks on the residuals of the new fit.
acf(lmMod2$residuals)

runs.test(lmMod2$residuals)  # runs test

## 
##  Runs Test - Two sided
## 
## data:  lmMod2$residuals
## Standardized Runs Statistic = 0.20913, p-value = 0.8343

#Runs Test - Two sided

# Durbin-Watson test on lmMod2, the refit that includes the lagged residual lag1.
lmtest::dwtest(lmMod2)

## 
##  Durbin-Watson test
## 
## data:  lmMod2
## DW = 2.0309, p-value = 0.6126
## alternative hypothesis: true autocorrelation is greater than 0

#=>   Durbin-Watson test

0.4 Assumption 5

# Fit dist on speed, then test whether the predictor is correlated with the
# residuals of that fit.
mod.lm <- lm(dist ~ speed, data = cars)
cor.test(cars$speed, mod.lm$residuals)  # Pearson correlation test

## 
##  Pearson's product-moment correlation
## 
## data:  cars$speed and mod.lm$residuals
## t = 5.583e-16, df = 48, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2783477  0.2783477
## sample estimates:
##          cor 
## 8.058406e-17

# Pearson's product-moment correlation

0.5 Assumption 6

The number of observations must be greater than the number of Xs. This can be directly observed by looking at the data.

0.6 Assumption 7

# Sample variance of the speed column of cars.
var(cars[["speed"]])

## [1] 27.95918

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:lawstat':
## 
##     levene.test

# Fit mpg against all remaining columns of mtcars.
mod2 <- lm(mpg ~ ., data = mtcars)

0.7 Assumption 8

The regression model is correctly specified. This means that if the Y and X variables have an inverse relationship, the model equation should be specified appropriately: $Y = \beta_1 + \beta_2 \cdot (1/X)$

0.8 Assumption 9

library(corrplot)

## corrplot 0.92 loaded

# Correlation matrix of every mtcars column except the first, shown as a plot.
predictor_cor <- cor(mtcars[, -1])
corrplot(predictor_cor)

# Variance inflation factors for a fit of mpg on the two predictors cyl and gear.
library(car)
mod <- lm(mpg ~ cyl + gear, data = mtcars)
vif(mod)

##      cyl     gear 
## 1.320551 1.320551

# Refit dist on speed, then draw the fit's diagnostic plots in a 2 x 2 grid.
mod <- lm(dist ~ speed, data = cars)
par(mfrow = c(2, 2))
plot(mod)

0.9 Assumption 10

# Fit dist on speed and show the diagnostic plots of the fit on a 2 x 2 layout.
mod <- lm(dist ~ speed, data = cars)
par(mfrow = c(2, 2))
plot(mod)

0.10 Check Assumptions Automatically

# Fit dist on speed and run gvlma's checks on the fit. The 2 x 2 layout is
# left active for plots drawn afterwards.
mod <- lm(dist ~ speed, data = cars)
par(mfrow = c(2, 2))  # four plots per page
gvlma::gvlma(mod)

## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Coefficients:
## (Intercept)        speed  
##     -17.579        3.932  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma::gvlma(x = mod) 
## 
##                     Value  p-value                   Decision
## Global Stat        15.801 0.003298 Assumptions NOT satisfied!
## Skewness            6.528 0.010621 Assumptions NOT satisfied!
## Kurtosis            1.661 0.197449    Assumptions acceptable.
## Link Function       2.329 0.126998    Assumptions acceptable.
## Heteroscedasticity  5.283 0.021530 Assumptions NOT satisfied!

# Draw the diagnostic plots for the lm fit currently stored in mod.
plot(mod)

# Refit without rows 23, 35 and 49 of cars, then rerun the gvlma checks.
mod <- lm(dist ~ speed, data = cars[-c(23, 35, 49), ])
gvlma::gvlma(mod)

## 
## Call:
## lm(formula = dist ~ speed, data = cars[-c(23, 35, 49), ])
## 
## Coefficients:
## (Intercept)        speed  
##     -15.137        3.608  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma::gvlma(x = mod) 
## 
##                     Value p-value                Decision
## Global Stat        7.5910 0.10776 Assumptions acceptable.
## Skewness           0.8129 0.36725 Assumptions acceptable.
## Kurtosis           0.2210 0.63831 Assumptions acceptable.
## Link Function      3.2239 0.07257 Assumptions acceptable.
## Heteroscedasticity 3.3332 0.06789 Assumptions acceptable.

# Print per-observation influence measures for the lm fit currently stored in mod.
influence.measures(mod)

## Influence measures of
##   lm(formula = dist ~ speed, data = cars[-c(23, 35, 49), ]) :
## 
##       dfb.1_ dfb.sped    dffit cov.r   cook.d    hat inf
## 1   0.087848 -0.08003  0.08834 1.184 3.99e-03 0.1187   *
## 2   0.351238 -0.32000  0.35320 1.138 6.25e-02 0.1187   *
## 3  -0.145914  0.12652 -0.15010 1.114 1.14e-02 0.0735    
## 4   0.285653 -0.24768  0.29384 1.075 4.31e-02 0.0735    
## 5   0.047920 -0.04053  0.05012 1.113 1.28e-03 0.0615    
## 6  -0.136783  0.11208 -0.14670 1.083 1.09e-02 0.0511    
## 7  -0.047436  0.03725 -0.05287 1.089 1.43e-03 0.0422    
## 8   0.081425 -0.06394  0.09076 1.083 4.19e-03 0.0422    
## 9   0.212931 -0.16721  0.23734 1.031 2.80e-02 0.0422    
## 10 -0.103835  0.07682 -0.12283 1.064 7.64e-03 0.0349    
## 11  0.047151 -0.03488  0.05578 1.080 1.59e-03 0.0349    
## 12 -0.163139  0.11031 -0.21176 1.008 2.22e-02 0.0292    
## 13 -0.092988  0.06288 -0.12070 1.054 7.37e-03 0.0292    
## 14 -0.047239  0.03194 -0.06132 1.071 1.92e-03 0.0292    
## 15 -0.001863  0.00126 -0.00242 1.077 2.99e-06 0.0292    
## 16 -0.052208  0.03031 -0.07843 1.061 3.13e-03 0.0250    
## 17  0.020094 -0.01167  0.03019 1.071 4.66e-04 0.0250    
## 18  0.020094 -0.01167  0.03019 1.071 4.66e-04 0.0250    
## 19  0.130480 -0.07576  0.19602 1.003 1.90e-02 0.0250    
## 20 -0.063700  0.02683 -0.12078 1.040 7.35e-03 0.0224    
## 21  0.004170 -0.00176  0.00791 1.070 3.20e-05 0.0224    
## 22  0.174775 -0.07362  0.33138 0.870 5.06e-02 0.0224    
## 24 -0.087733  0.00892 -0.24379 0.948 2.86e-02 0.0213    
## 25 -0.059046  0.00600 -0.16408 1.011 1.34e-02 0.0213    
## 26  0.068553 -0.00697  0.19049 0.992 1.79e-02 0.0213    
## 27 -0.023886 -0.02060 -0.13480 1.031 9.12e-03 0.0218    
## 28 -0.005806 -0.00501 -0.03276 1.067 5.48e-04 0.0218    
## 29  0.000274 -0.06235 -0.19077 1.002 1.80e-02 0.0238    
## 30  0.000118 -0.02688 -0.08223 1.058 3.44e-03 0.0238    
## 31 -0.000072  0.01639  0.05015 1.066 1.28e-03 0.0238    
## 32  0.017941 -0.05284 -0.11169 1.054 6.32e-03 0.0274    
## 33 -0.014171  0.04174  0.08822 1.062 3.95e-03 0.0274    
## 34 -0.063461  0.18692  0.39507 0.848 7.09e-02 0.0274   *
## 36  0.081746 -0.16394 -0.27847 0.976 3.77e-02 0.0326    
## 37  0.034106 -0.06840 -0.11619 1.062 6.84e-03 0.0326    
## 38 -0.067855  0.13608  0.23115 1.007 2.64e-02 0.0326    
## 39  0.182368 -0.30828 -0.45545 0.875 9.51e-02 0.0393    
## 40  0.062731 -0.10604 -0.15666 1.060 1.24e-02 0.0393    
## 41  0.034787 -0.05880 -0.08688 1.080 3.84e-03 0.0393    
## 42  0.007121 -0.01204 -0.01778 1.088 1.62e-04 0.0393    
## 43 -0.048260  0.08158  0.12053 1.071 7.37e-03 0.0393    
## 44 -0.020499  0.02947  0.03716 1.108 7.06e-04 0.0573    
## 45  0.200260 -0.27525 -0.33127 1.051 5.43e-02 0.0687    
## 46  0.024652 -0.03277 -0.03811 1.138 7.42e-04 0.0816   *
## 47 -0.358515  0.47655  0.55420 0.979 1.46e-01 0.0816    
## 48 -0.377456  0.50173  0.58348 0.964 1.60e-01 0.0816    
## 50 -0.195430  0.25314  0.28687 1.118 4.14e-02 0.0961