Introduction

The goal of this paper is to present: the concept of prediction in Multiple Regression, the assumptions underlying multiple regression analysis and cross-validation.

#Loading libraries
library(downloader)
library(ggplot2)
#library(GGally)
#Downloading data from University of Lyon2
url <- "http://eric.univ-lyon2.fr/~ricco/cours/didacticiels/R/automobiles_pour_regression.txt"
filename   <- "automobiles_pour_regression.txt"
           
     if (!file.exists(filename)) download(url,filename)
     msleep <- read.table(file = "automobiles_pour_regression.txt",sep="\t",header=TRUE,dec=".",row.names=1)
#Changing column names
#Old columns from UL2
head(msleep,n=2)
##                       prix cylindree puissance poids conso
## Daihatsu Cuore       11600       846        32   650   5.7
## Suzuki Swift 1.0 GLS 12490       993        39   790   5.8
colnames(msleep) <- c("Price","Cylinder","Power","Weight","Consumption")
#News columns
head(msleep,n=2)
##                      Price Cylinder Power Weight Consumption
## Daihatsu Cuore       11600      846    32    650         5.7
## Suzuki Swift 1.0 GLS 12490      993    39    790         5.8

Multiple Linear Regression

#Consumption is the response variable
#Price, Cylinder, Power and Weight are explanatory variables
#Our model is Consumption ~ Price + Cylinder + Power + Weight
varReg <- lm (Consumption ~ Price + Cylinder + Power + Weight, msleep)
varRes <- residuals(varReg)

Plot of residuals (Normality)

qqnorm(varRes,datax=TRUE,ylab="Quantiles Observed",xlab="Theoretical Quantile")

                  QQ-plot allows to check the normality of distribution.

The comparison between Quantiles Observed and Theoretical Quantile in the Q-Q Plot, we can observe Henry’s line which is showing that the distribution is consistent with the normal distribution.

Residuals vs variables

par(mfrow =c(2,3))

for (j in 1:5){
               plot(msleep[,j], varRes, ylab="Residuals", xlab=names(msleep)[j],col="red")
               abline(h=0)
              }

The graphs of the residuals (y-axis) vs. the study variables (x-axis) to visually detect outliers: both variables (the points on the outskirts x-axis) in the regression (the points on the periphery axis).

Study of outliers

Outlier with The standardized residual

#Calculation of standardized residual
varStandRes <- rstandard(varReg) 
#Setting the value of alpha
alpha <- 0.1
#Calculation of the threshold from Student distrution (n-p-1) degree of freedom
#n is the size of dataset and p is the number of columns
n <- dim(msleep)[1]
p <- dim(msleep)[2]

#In statistics, the number of degrees of freedom is the number of values in the final calculation of a statistic that are free to vary
dFreedom <- n-p-1
threshold <- qt(1-alpha/2,dFreedom)

plot(msleep$Consumption,varStandRes,ylab="Standardized residual",xlab="Consumption", col="blue")
#Adding to this graph limits for ruling on the atypical nature of the residual
abline(h=-threshold)#Negative threshold
abline(h=+threshold)#Postive threshold
abline(h=0)#Line in the middle (Point XOY)

#Observation in the dataset challenging the standardized residual with the negative/positive threshold
obs <- msleep [varStandRes < -threshold | varStandRes > +threshold,]

#Vehicle name retrieval in the dataset and put it in the plot
for (i in 1:nrow(obs)){
                        vName <- row.names(obs)[i]
                        text(msleep[vName,"Consumption"],varStandRes[vName],vName)
                      }

                  Automatic detection of outliers within the meaning of standardized residual

Outlier with Hat Matrix

#In statistics, the hat matrix, H, sometimes also called the influence matrix or projection matrix, maps the vector of response values (dependent variable values) to the vector of fitted values (or predicted values).

outlier <- influence.measures(varReg)

#Attribritues attached to the hat matrix
attributes(outlier)
## $names
## [1] "infmat" "is.inf" "call"  
## 
## $class
## [1] "infl"
#Retrieval of Levarage points in the columns "hat"
levPoints <- outlier$infmat[,"hat"]

#Recommended threshold formula for the Levarage is 2*p/n
rT <- 2*p/n

#outliers within the meaning of the leverage points
oLP <- msleep [levPoints > rT,]
print(oLP)
##                     Price Cylinder Power Weight Consumption
## Ferrari 456 GT     285000     5474   325   1690        21.3
## Mercedes S 600     183900     5987   300   2250        18.7
## Maserati Ghibli GT  92500     2789   209   1485        14.5

Observation

The comparison of Outlier with standardized residual and Hat Matrix, we can see :

  1. Ferrari and Mercedes are atypical, they are both in the description as atypical in predicting the consumption

Data Cleaning

Remove suspicious outliers

#Boolean vector of suspicion for standardized residual
bStRes <- (varStandRes < -threshold | varStandRes > +threshold ) 

#Boolean vector of suspicion for Hat Matrix
bLevPoints <- (levPoints > rT)

#Detection of suspicion
susp <- bStRes | bLevPoints

#Data not suspected from the dataset
notSusp <- !susp

msleep2 <- msleep [notSusp,]

The new size of the dataset is 26.

Regression with cleaned data

varRegCleaned <- lm(Consumption ~., msleep2)

#Multiple R-Squared
MRS <- summary(varRegCleaned)$r.squared
MRS
## [1] 0.9259853

[obs1] The Multiple R-Squared is : 0.9259853, from this value we will detect the pairwise correlations among all the variables which are posing problem in the consumption of vehicle

#Detection of collinearity :  Klein's Rule of Thumb > suggests that multicollinearity may be a problem only if the R² obtained from an auxiliary regression is greater than the overall R²
mcxx <- cor(msleep2[,c(1,2,3,4)])
#Power 2
mcxx <- mcxx^2
mcxx
##              Price  Cylinder     Power    Weight
## Price    1.0000000 0.8727168 0.8550363 0.8952794
## Cylinder 0.8727168 1.0000000 0.9481205 0.7774552
## Power    0.8550363 0.9481205 1.0000000 0.7213830
## Weight   0.8952794 0.7774552 0.7213830 1.0000000

Cylinder and Power values are roughly the same with the Multiple R-Squared. Let check this observation with the Variance Inflation Factor if we will get the same result.

#Loading libraries
library(sp)
library(raster)
library(usdm)
library(MASS)
#Detection of collinearity : Variance Inflation Factor (VIF)
#It provides an index that measures how much the variance (the square of the estimate's standard deviation) of an estimated regression coefficient is increased because of collinearity.
dT <- msleep2[,c(1,2,3,4)]
vif(dT)
##   Variables      VIF
## 1     Price 19.22509
## 2  Cylinder 24.36757
## 3     Power 22.35158
## 4    Weight 10.69749

Here the max value is the variable Cylinder and min value is the variable Weight.The first observation [obs1] is not accurate.

Let compare VIF and the AIC if we will have the same conclusion

#Variable Selection based on AIC (Akaike information criterion)
#First method
varRegSel1 <- lm(Consumption ~ Price + Cylinder + Power + Weight,data= msleep2)
stepAIC(varRegSel1,direction = "backward",trace=TRUE)
## Start:  AIC=-16.76
## Consumption ~ Price + Cylinder + Power + Weight
## 
##            Df Sum of Sq     RSS     AIC
## - Power     1    0.0001  9.2886 -18.762
## - Price     1    0.2367  9.5252 -18.108
## - Cylinder  1    0.3463  9.6348 -17.811
## <none>                   9.2885 -16.762
## - Weight    1    3.4582 12.7467 -10.533
## 
## Step:  AIC=-18.76
## Consumption ~ Price + Cylinder + Weight
## 
##            Df Sum of Sq     RSS     AIC
## - Price     1    0.2679  9.5565 -20.023
## <none>                   9.2886 -18.762
## - Cylinder  1    1.0400 10.3286 -18.003
## - Weight    1    3.8881 13.1767 -11.671
## 
## Step:  AIC=-20.02
## Consumption ~ Cylinder + Weight
## 
##            Df Sum of Sq     RSS      AIC
## <none>                   9.5565 -20.0227
## - Cylinder  1    3.2279 12.7844 -14.4567
## - Weight    1   11.7250 21.2815  -1.2067
## 
## Call:
## lm(formula = Consumption ~ Cylinder + Weight, data = msleep2)
## 
## Coefficients:
## (Intercept)     Cylinder       Weight  
##    1.388941     0.001266     0.004570
#Second method
varRegSel2 <- lm(Consumption ~ 1,data=msleep2)
stepAIC(varRegSel2,scope=list(upper=~Price+Cylinder+Power+Weight,lower=~1),direction="forward",trace=TRUE)
## Start:  AIC=42.93
## Consumption ~ 1
## 
##            Df Sum of Sq     RSS     AIC
## + Weight    1   112.711  12.784 -14.457
## + Price     1   111.353  14.142 -11.833
## + Cylinder  1   104.214  21.281  -1.207
## + Power     1    98.277  27.219   5.191
## <none>                  125.495  42.928
## 
## Step:  AIC=-14.46
## Consumption ~ Weight
## 
##            Df Sum of Sq     RSS     AIC
## + Cylinder  1    3.2279  9.5565 -20.023
## + Power     1    2.8839  9.9005 -19.103
## + Price     1    2.4558 10.3286 -18.003
## <none>                  12.7844 -14.457
## 
## Step:  AIC=-20.02
## Consumption ~ Weight + Cylinder
## 
##         Df Sum of Sq    RSS     AIC
## <none>               9.5565 -20.023
## + Price  1  0.267874 9.2886 -18.762
## + Power  1  0.031319 9.5252 -18.108
## 
## Call:
## lm(formula = Consumption ~ Weight + Cylinder, data = msleep2)
## 
## Coefficients:
## (Intercept)       Weight     Cylinder  
##    1.388941     0.004570     0.001266

Search “backward” and “forward” looking the same result: weight and cylinder are the relevant variables to explain vehicle consumption.

library(DAAG)

Cross validation

#The goal of cross validation is to define a dataset to "test" the model in the training phase (i.e., the validation dataset), in order to limit problems like overfitting, give an insight on how the model will generalize to an independent dataset
CVlm(msleep2, form.lm = formula(Consumption ~ Price + Cylinder + Power + Weight), printit = TRUE,m=10)
## Analysis of Variance Table
## 
## Response: Consumption
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## Price      1  111.4   111.4  251.75 3.6e-13 ***
## Cylinder   1    1.0     1.0    2.18   0.154    
## Power      1    0.4     0.4    0.97   0.335    
## Weight     1    3.5     3.5    7.82   0.011 *  
## Residuals 21    9.3     0.4                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in CVlm(msleep2, form.lm = formula(Consumption ~ Price + Cylinder + : 
## 
##  As there is >1 explanatory variable, cross-validation
##  predicted values for a fold are not a linear function
##  of corresponding overall predicted values.  Lines that
##  are shown for the different folds are approximate

## 
## fold 1 
## Observations in test set: 2 
##             Opel Astra 1.6i 16V Honda Civic Joker 1.4
## Predicted                  8.38                 8.245
## cvpred                     8.50                 8.523
## Consumption                7.40                 7.700
## CV residual               -1.10                -0.823
## 
## Sum of squares = 1.9    Mean square = 0.95    n = 2 
## 
## fold 2 
## Observations in test set: 3 
##             Lancia K 3.0 LS Toyota Previa salon Volvo 960 Kombi aut
## Predicted            12.337              12.813               11.90
## cvpred               12.195              12.513               11.63
## Consumption          11.900              12.800               12.70
## CV residual          -0.295               0.287                1.07
## 
## Sum of squares = 1.31    Mean square = 0.44    n = 3 
## 
## fold 3 
## Observations in test set: 3 
##             Subaru Vivio 4WD Mazda Hachtback V Opel Omega 2.5i V6
## Predicted               5.80           10.5668              12.25
## cvpred                  5.29           10.7431              12.35
## Consumption             6.80           10.8000              11.30
## CV residual             1.51            0.0569              -1.05
## 
## Sum of squares = 3.38    Mean square = 1.13    n = 3 
## 
## fold 4 
## Observations in test set: 3 
##             Daihatsu Cuore Opel Corsa 1.2i Eco Seat Alhambra 2.0
## Predicted           5.5684               6.952            11.258
## cvpred              5.6174               6.973            11.127
## Consumption         5.7000               6.800            11.600
## CV residual         0.0826              -0.173             0.473
## 
## Sum of squares = 0.26    Mean square = 0.09    n = 3 
## 
## fold 5 
## Observations in test set: 3 
##             Peugeot 306 XS 108 Renault Safrane 2.2. V Nissan Primera 2.0
## Predicted                 8.52                 10.917              9.437
## cvpred                    8.43                 10.792              9.304
## Consumption               9.00                 11.700              9.200
## CV residual               0.57                  0.908             -0.104
## 
## Sum of squares = 1.16    Mean square = 0.39    n = 3 
## 
## fold 6 
## Observations in test set: 3 
##             Citroen ZX Volcane Volvo 850 2.5 Peugeot 806 2.0
## Predicted                9.117        10.787          11.002
## cvpred                   9.144        10.819          11.033
## Consumption              8.800        10.800          10.800
## CV residual             -0.344        -0.019          -0.233
## 
## Sum of squares = 0.17    Mean square = 0.06    n = 3 
## 
## fold 7 
## Observations in test set: 3 
##             Fiat Panda Mambo L Seat Ibiza 2.0 GTI VW Golt 2.0 GTI
## Predicted                5.888               8.64           9.257
## cvpred                   5.798               8.35           9.131
## Consumption              6.100               9.50           9.500
## CV residual              0.302               1.15           0.369
## 
## Sum of squares = 1.55    Mean square = 0.52    n = 3 
## 
## fold 8 
## Observations in test set: 2 
##             Suzuki Swift 1.0 GLS Toyota Corolla
## Predicted                  6.274          7.674
## cvpred                     6.388          7.758
## Consumption                5.800          7.100
## CV residual               -0.588         -0.658
## 
## Sum of squares = 0.78    Mean square = 0.39    n = 2 
## 
## fold 9 
## Observations in test set: 2 
##             VW Polo 1.4 60 Fort Escort 1.4i PT
## Predicted             7.44               8.141
## cvpred                7.63               8.187
## Consumption           6.50               8.600
## CV residual          -1.13               0.413
## 
## Sum of squares = 1.46    Mean square = 0.73    n = 2 
## 
## fold 10 
## Observations in test set: 2 
##             Fiat Tempra 1.6 Liberty Ford Fiesta 1.2 Zetec
## Predicted                      8.28                 7.328
## cvpred                         8.26                 7.369
## Consumption                    9.30                 6.600
## CV residual                    1.04                -0.769
## 
## Sum of squares = 1.68    Mean square = 0.84    n = 2 
## 
## Overall (Sum over all 2 folds) 
##    ms 
## 0.525