R is an open source statistical computing application that is great for performing quantitative methods. There exists an extensive number of libraries create by statistician and those who utilize quantitative methods in their research.
In this example, I demonstrate a very simple example of creating the multiple linear regression model and then following up with a couple of tests to check the assumptions of the data. It is not exhaustive and meant to be instructive only. The data set is the “Advertising” data set avaiable on Kaggle.
This work is offered to the public with no claims to any rights whatsoever.
options(messages = FALSE)
#if these are not installed, then use : install.packages("nameoflibrary")
suppressWarnings({
suppressPackageStartupMessages({
library(psych)
library(car)
})
})
#import the data set
df <- read.csv(file.choose(), header = TRUE)
#Check the dimensions
dim(df)
## [1] 200 4
#Check column names
names(df)
## [1] "TV" "Radio" "Newspaper" "Sales"
#check for missing/nulls
sum(is.na(df))
## [1] 0
#attach the data
attach(df)
#Small EDA
summary(df)
## TV Radio Newspaper Sales
## Min. : 0.70 Min. : 0.000 Min. : 0.30 Min. : 1.60
## 1st Qu.: 74.38 1st Qu.: 9.975 1st Qu.: 12.75 1st Qu.:11.00
## Median :149.75 Median :22.900 Median : 25.75 Median :16.00
## Mean :147.04 Mean :23.264 Mean : 30.55 Mean :15.13
## 3rd Qu.:218.82 3rd Qu.:36.525 3rd Qu.: 45.10 3rd Qu.:19.05
## Max. :296.40 Max. :49.600 Max. :114.00 Max. :27.00
describeBy(df)
## Warning in describeBy(df): no grouping variable requested
## vars n mean sd median trimmed mad min max range skew
## TV 1 200 147.04 85.85 149.75 147.20 108.82 0.7 296.4 295.7 -0.07
## Radio 2 200 23.26 14.85 22.90 23.00 19.79 0.0 49.6 49.6 0.09
## Newspaper 3 200 30.55 21.78 25.75 28.41 23.13 0.3 114.0 113.7 0.88
## Sales 4 200 15.13 5.28 16.00 15.17 6.08 1.6 27.0 25.4 -0.07
## kurtosis se
## TV -1.24 6.07
## Radio -1.28 1.05
## Newspaper 0.57 1.54
## Sales -0.68 0.37
boxplot(df, col = 'lightblue')
boxplot(Sales, col = 'lightblue')
hist(Sales, freq=FALSE, col = 'lightblue',
main = 'Histogram of Sales in 1000 units', xlab = 'Values', ylab = 'density')
lines(density(Sales), col='blue')
plot(df, col = 'blue', pch = 16)
corr.test(df, ci = TRUE)
## Call:corr.test(x = df, ci = TRUE)
## Correlation matrix
## TV Radio Newspaper Sales
## TV 1.00 0.05 0.06 0.90
## Radio 0.05 1.00 0.35 0.35
## Newspaper 0.06 0.35 1.00 0.16
## Sales 0.90 0.35 0.16 1.00
## Sample Size
## [1] 200
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## TV Radio Newspaper Sales
## TV 0.00 0.85 0.85 0.00
## Radio 0.44 0.00 0.00 0.00
## Newspaper 0.43 0.00 0.00 0.08
## Sales 0.00 0.00 0.03 0.00
##
## To see confidence intervals of the correlations, print with the short=FALSE option
cor.ci(df)
## Call:corCi(x = x, keys = keys, n.iter = n.iter, p = p, overlap = overlap,
## poly = poly, method = method, plot = plot, minlength = minlength,
## n = n)
##
## Coefficients and bootstrapped confidence intervals
## TV Radio Nwspp Sales
## TV 1.00
## Radio 0.05 1.00
## Newspaper 0.06 0.35 1.00
## Sales 0.90 0.35 0.16 1.00
##
## scale correlations and bootstrapped confidence intervals
## lower.emp lower.norm estimate upper.norm upper.emp p
## TV-Radio -0.07 -0.08 0.05 0.17 0.17 0.51
## TV-Nwspp -0.07 -0.08 0.06 0.22 0.22 0.37
## TV-Sales 0.88 0.88 0.90 0.92 0.92 0.00
## Radio-Nwspp 0.23 0.23 0.35 0.48 0.47 0.00
## Radio-Sales 0.22 0.22 0.35 0.45 0.46 0.00
## Nwspp-Sales 0.03 0.02 0.16 0.32 0.33 0.03
#Explanatory Linear Regression
model <- lm(Sales ~., data =df )
summary(model)
##
## Call:
## lm(formula = Sales ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3034 -0.8244 -0.0008 0.8976 3.7473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6251241 0.3075012 15.041 <2e-16 ***
## TV 0.0544458 0.0013752 39.592 <2e-16 ***
## Radio 0.1070012 0.0084896 12.604 <2e-16 ***
## Newspaper 0.0003357 0.0057881 0.058 0.954
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.662 on 196 degrees of freedom
## Multiple R-squared: 0.9026, Adjusted R-squared: 0.9011
## F-statistic: 605.4 on 3 and 196 DF, p-value: < 2.2e-16
confint(model)
## 2.5 % 97.5 %
## (Intercept) 4.01868836 5.23155980
## TV 0.05173372 0.05715784
## Radio 0.09025861 0.12374384
## Newspaper -0.01107921 0.01175052
anova(model)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## TV 1 4512.4 4512.4 1634.2115 <2e-16 ***
## Radio 1 502.3 502.3 181.9255 <2e-16 ***
## Newspaper 1 0.0 0.0 0.0034 0.9538
## Residuals 196 541.2 2.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif(model)
## TV Radio Newspaper
## 1.004611 1.144952 1.145187
#Compare the model to the null_model
null_model <- lm(Sales ~ 1, data =df)
anova(null_model, model)
## Analysis of Variance Table
##
## Model 1: Sales ~ 1
## Model 2: Sales ~ TV + Radio + Newspaper
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 199 5556.0
## 2 196 541.2 3 5014.8 605.38 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(null_model)
## [1] 1236.438
AIC(model)
## [1] 776.6702
#Use backwards elimination to find the best set of features
final_model <- step(model, direction = "backward", trace = 1)
## Start: AIC=207.09
## Sales ~ TV + Radio + Newspaper
##
## Df Sum of Sq RSS AIC
## - Newspaper 1 0.0 541.2 205.10
## <none> 541.2 207.09
## - Radio 1 438.6 979.8 323.81
## - TV 1 4328.2 4869.4 644.48
##
## Step: AIC=205.1
## Sales ~ TV + Radio
##
## Df Sum of Sq RSS AIC
## <none> 541.2 205.10
## - Radio 1 502.3 1043.5 334.41
## - TV 1 4335.6 4876.8 642.79
summary(final_model)
##
## Call:
## lm(formula = Sales ~ TV + Radio, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3131 -0.8269 0.0095 0.9022 3.7484
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.630879 0.290308 15.95 <2e-16 ***
## TV 0.054449 0.001371 39.73 <2e-16 ***
## Radio 0.107175 0.007926 13.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.657 on 197 degrees of freedom
## Multiple R-squared: 0.9026, Adjusted R-squared: 0.9016
## F-statistic: 912.7 on 2 and 197 DF, p-value: < 2.2e-16
confint(final_model)
## 2.5 % 97.5 %
## (Intercept) 4.05836898 5.20338995
## TV 0.05174600 0.05715192
## Radio 0.09154425 0.12280489
vif(model)
## TV Radio Newspaper
## 1.004611 1.144952 1.145187
shapiro.test(fitted(model)- Sales)
##
## Shapiro-Wilk normality test
##
## data: fitted(model) - Sales
## W = 0.97581, p-value = 0.001576
#Examine the residuals others
plot(final_model)