Multiple Linear Regression Using R

R is an open source statistical computing application that is great for performing quantitative methods. There exists an extensive number of libraries create by statistician and those who utilize quantitative methods in their research.

In this example, I demonstrate a very simple example of creating the multiple linear regression model and then following up with a couple of tests to check the assumptions of the data. It is not exhaustive and meant to be instructive only. The data set is the “Advertising” data set avaiable on Kaggle.

This work is offered to the public with no claims to any rights whatsoever.

options(messages = FALSE)
#if these are not installed, then use : install.packages("nameoflibrary")
suppressWarnings({
suppressPackageStartupMessages({
library(psych)
library(car)
})
})
#import the data set 
df <- read.csv(file.choose(), header = TRUE)
#Check the dimensions
dim(df)
## [1] 200   4
#Check column names 
names(df)
## [1] "TV"        "Radio"     "Newspaper" "Sales"
#check for missing/nulls 
sum(is.na(df))
## [1] 0
#attach the data 
attach(df) 
#Small EDA 
summary(df)
##        TV             Radio          Newspaper          Sales      
##  Min.   :  0.70   Min.   : 0.000   Min.   :  0.30   Min.   : 1.60  
##  1st Qu.: 74.38   1st Qu.: 9.975   1st Qu.: 12.75   1st Qu.:11.00  
##  Median :149.75   Median :22.900   Median : 25.75   Median :16.00  
##  Mean   :147.04   Mean   :23.264   Mean   : 30.55   Mean   :15.13  
##  3rd Qu.:218.82   3rd Qu.:36.525   3rd Qu.: 45.10   3rd Qu.:19.05  
##  Max.   :296.40   Max.   :49.600   Max.   :114.00   Max.   :27.00
describeBy(df)
## Warning in describeBy(df): no grouping variable requested
##           vars   n   mean    sd median trimmed    mad min   max range  skew
## TV           1 200 147.04 85.85 149.75  147.20 108.82 0.7 296.4 295.7 -0.07
## Radio        2 200  23.26 14.85  22.90   23.00  19.79 0.0  49.6  49.6  0.09
## Newspaper    3 200  30.55 21.78  25.75   28.41  23.13 0.3 114.0 113.7  0.88
## Sales        4 200  15.13  5.28  16.00   15.17   6.08 1.6  27.0  25.4 -0.07
##           kurtosis   se
## TV           -1.24 6.07
## Radio        -1.28 1.05
## Newspaper     0.57 1.54
## Sales        -0.68 0.37
boxplot(df, col = 'lightblue')

boxplot(Sales, col = 'lightblue')

hist(Sales, freq=FALSE, col = 'lightblue', 
     main = 'Histogram of Sales in 1000 units', xlab = 'Values', ylab = 'density')
lines(density(Sales), col='blue')

plot(df, col = 'blue', pch = 16)

corr.test(df, ci = TRUE)
## Call:corr.test(x = df, ci = TRUE)
## Correlation matrix 
##             TV Radio Newspaper Sales
## TV        1.00  0.05      0.06  0.90
## Radio     0.05  1.00      0.35  0.35
## Newspaper 0.06  0.35      1.00  0.16
## Sales     0.90  0.35      0.16  1.00
## Sample Size 
## [1] 200
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##             TV Radio Newspaper Sales
## TV        0.00  0.85      0.85  0.00
## Radio     0.44  0.00      0.00  0.00
## Newspaper 0.43  0.00      0.00  0.08
## Sales     0.00  0.00      0.03  0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
cor.ci(df)

## Call:corCi(x = x, keys = keys, n.iter = n.iter, p = p, overlap = overlap, 
##     poly = poly, method = method, plot = plot, minlength = minlength, 
##     n = n)
## 
##  Coefficients and bootstrapped confidence intervals 
##           TV   Radio Nwspp Sales
## TV        1.00                  
## Radio     0.05 1.00             
## Newspaper 0.06 0.35  1.00       
## Sales     0.90 0.35  0.16  1.00 
## 
##  scale correlations and bootstrapped confidence intervals 
##             lower.emp lower.norm estimate upper.norm upper.emp    p
## TV-Radio        -0.07      -0.08     0.05       0.17      0.17 0.51
## TV-Nwspp        -0.07      -0.08     0.06       0.22      0.22 0.37
## TV-Sales         0.88       0.88     0.90       0.92      0.92 0.00
## Radio-Nwspp      0.23       0.23     0.35       0.48      0.47 0.00
## Radio-Sales      0.22       0.22     0.35       0.45      0.46 0.00
## Nwspp-Sales      0.03       0.02     0.16       0.32      0.33 0.03
#Explanatory Linear Regression 
model <- lm(Sales ~., data =df )

summary(model)
## 
## Call:
## lm(formula = Sales ~ ., data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3034 -0.8244 -0.0008  0.8976  3.7473 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.6251241  0.3075012  15.041   <2e-16 ***
## TV          0.0544458  0.0013752  39.592   <2e-16 ***
## Radio       0.1070012  0.0084896  12.604   <2e-16 ***
## Newspaper   0.0003357  0.0057881   0.058    0.954    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.662 on 196 degrees of freedom
## Multiple R-squared:  0.9026, Adjusted R-squared:  0.9011 
## F-statistic: 605.4 on 3 and 196 DF,  p-value: < 2.2e-16
confint(model)
##                   2.5 %     97.5 %
## (Intercept)  4.01868836 5.23155980
## TV           0.05173372 0.05715784
## Radio        0.09025861 0.12374384
## Newspaper   -0.01107921 0.01175052
anova(model)
## Analysis of Variance Table
## 
## Response: Sales
##            Df Sum Sq Mean Sq   F value Pr(>F)    
## TV          1 4512.4  4512.4 1634.2115 <2e-16 ***
## Radio       1  502.3   502.3  181.9255 <2e-16 ***
## Newspaper   1    0.0     0.0    0.0034 0.9538    
## Residuals 196  541.2     2.8                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif(model)
##        TV     Radio Newspaper 
##  1.004611  1.144952  1.145187
#Compare the model to the null_model 

null_model <- lm(Sales ~ 1, data =df)

anova(null_model, model)
## Analysis of Variance Table
## 
## Model 1: Sales ~ 1
## Model 2: Sales ~ TV + Radio + Newspaper
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    199 5556.0                                  
## 2    196  541.2  3    5014.8 605.38 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(null_model)
## [1] 1236.438
AIC(model)
## [1] 776.6702
#Use backwards elimination to find the best set of features
final_model <- step(model, direction = "backward", trace = 1)
## Start:  AIC=207.09
## Sales ~ TV + Radio + Newspaper
## 
##             Df Sum of Sq    RSS    AIC
## - Newspaper  1       0.0  541.2 205.10
## <none>                    541.2 207.09
## - Radio      1     438.6  979.8 323.81
## - TV         1    4328.2 4869.4 644.48
## 
## Step:  AIC=205.1
## Sales ~ TV + Radio
## 
##         Df Sum of Sq    RSS    AIC
## <none>                541.2 205.10
## - Radio  1     502.3 1043.5 334.41
## - TV     1    4335.6 4876.8 642.79
summary(final_model)
## 
## Call:
## lm(formula = Sales ~ TV + Radio, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3131 -0.8269  0.0095  0.9022  3.7484 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.630879   0.290308   15.95   <2e-16 ***
## TV          0.054449   0.001371   39.73   <2e-16 ***
## Radio       0.107175   0.007926   13.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.657 on 197 degrees of freedom
## Multiple R-squared:  0.9026, Adjusted R-squared:  0.9016 
## F-statistic: 912.7 on 2 and 197 DF,  p-value: < 2.2e-16
confint(final_model)
##                  2.5 %     97.5 %
## (Intercept) 4.05836898 5.20338995
## TV          0.05174600 0.05715192
## Radio       0.09154425 0.12280489
vif(model)
##        TV     Radio Newspaper 
##  1.004611  1.144952  1.145187
shapiro.test(fitted(model)- Sales)
## 
##  Shapiro-Wilk normality test
## 
## data:  fitted(model) - Sales
## W = 0.97581, p-value = 0.001576
#Examine the residuals others 
plot(final_model)