1 Import libraries

library(readxl) # Library used to import Excel files
library(tidyverse) # Pack of most used libraries
library(skimr) # Library used for providing a summary of the data
library(DataExplorer) # NOTE(review): not called in the visible code; presumably kept for exploratory data analysis - confirm
library(corrplot) # Library used for correlation plots
library(car) # Library used for testing autocorrelation (Durbin Watson)
library(olsrr) # Library used for the multicollinearity diagnostics (ols_vif_tol, ols_eigen_cindex, ols_coll_diag)

2 Data loading

# Read the Excel workbook; the path is relative to the current working directory
dataset <- read_excel("TDM_Class3_MLR_Chicago_Example.xls") 
# Inspect the class of the imported object (read_excel() returns a tibble, see the output below)
class(dataset)
## [1] "tbl_df"     "tbl"        "data.frame"

3 Transform dataset into a dataframe

# Convert the tibble returned by read_excel() into a base R data frame
df <- data.frame(dataset)

4 Show summary statistics

# Overview of every column: missing values, mean, sd, quantiles and a small histogram
skim(df)
Data summary
Name df
Number of rows 57
Number of columns 6
_______________________
Column type frequency:
numeric 6
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
TODU 0 1 5.37 1.33 3.02 4.54 5.10 6.13 9.14 ▃▇▅▃▁
ACO 0 1 0.81 0.18 0.50 0.67 0.79 0.92 1.32 ▆▇▇▃▁
AHS 0 1 3.19 0.39 1.83 3.00 3.19 3.37 4.50 ▁▂▇▂▁
SI 0 1 13.07 12.19 2.17 6.82 9.86 15.08 62.53 ▇▂▁▁▁
SRI 0 1 49.56 15.84 20.89 38.14 49.37 60.85 87.38 ▅▆▇▅▂
UI 0 1 52.62 13.46 24.08 44.80 55.51 61.09 83.66 ▃▅▇▅▁
# Base R summary: minimum, quartiles, median, mean and maximum of each column
summary(df)
##       TODU            ACO              AHS              SI       
##  Min.   :3.020   Min.   :0.5000   Min.   :1.830   Min.   : 2.17  
##  1st Qu.:4.540   1st Qu.:0.6700   1st Qu.:3.000   1st Qu.: 6.82  
##  Median :5.100   Median :0.7900   Median :3.190   Median : 9.86  
##  Mean   :5.373   Mean   :0.8118   Mean   :3.185   Mean   :13.07  
##  3rd Qu.:6.130   3rd Qu.:0.9200   3rd Qu.:3.370   3rd Qu.:15.08  
##  Max.   :9.140   Max.   :1.3200   Max.   :4.500   Max.   :62.53  
##       SRI              UI       
##  Min.   :20.89   Min.   :24.08  
##  1st Qu.:38.14   1st Qu.:44.80  
##  Median :49.37   Median :55.51  
##  Mean   :49.56   Mean   :52.62  
##  3rd Qu.:60.85   3rd Qu.:61.09  
##  Max.   :87.38   Max.   :83.66

5 Multiple Linear Regression

5.1 Check assumptions

# Linear relation: check whether each independent variable has an
# approximately linear relation with the dependent variable (TODU).
# The dependent variable is drawn on the vertical axis, which matches the
# model TODU ~ predictors and the first row of the scatterplot matrix below.
predictors <- c("ACO", "AHS", "SI", "SRI", "UI")
old_par <- par(mfrow = c(2, 3)) # 2 x 3 grid holds the five scatterplots
for (predictor in predictors) {
  plot(x = df[[predictor]], y = df$TODU, xlab = predictor, ylab = "TODU")
}
par(old_par) # restore the previous layout so later plots are not affected

# or pairwise scatterplot matrix, that compares every variable with each other;
# only the upper triangle is drawn because the lower one would mirror it
pairs(df[, 1:6], pch = 19, lower.panel = NULL)

# Normal distribution of dependent variable
# Null hypothesis: the sample comes from a normally distributed population.
# Rule of thumb used here: if the sample is smaller than 2000 observations, use the Shapiro-Wilk test
# (shapiro.test() itself accepts between 3 and 5000 observations).
shapiro.test(df$TODU) # p-value > 0.05: fail to reject the null hypothesis, so normality of TODU is not rejected
## 
##  Shapiro-Wilk normality test
## 
## data:  df$TODU
## W = 0.96816, p-value = 0.1377
# Otherwise (rule of thumb: samples larger than 2,000 observations), use the Kolmogorov-Smirnov test.
# The data summary above reports 57 rows, so this call is presumably shown for illustration only.
# NOTE(review): mean and sd are estimated from the same sample that is being tested; the ks.test()
# documentation requires pre-specified parameters, so the reported p-value is not exact
# (a Lilliefors-type correction would be needed for a valid test).
# Repeated values in TODU trigger the "ties" warning printed below.
ks.test(df$TODU, "pnorm", mean=mean(df$TODU), sd = sd(df$TODU))
## Warning in ks.test(df$TODU, "pnorm", mean = mean(df$TODU), sd = sd(df$TODU)):
## ties should not be present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  df$TODU
## D = 0.12231, p-value = 0.3612
## alternative hypothesis: two-sided

5.2 Multiple linear regression model

# Fit an ordinary least squares model with TODU as the dependent variable
# and ACO, AHS, SI, SRI and UI as the independent variables
model <- lm(TODU ~ ACO + AHS + SI + SRI + UI, data = df)
# Print coefficient estimates with their t-tests, R-squared and the overall F-test
summary(model)
## 
## Call:
## lm(formula = TODU ~ ACO + AHS + SI + SRI + UI, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4771 -0.3842 -0.0262  0.4116  2.0806 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.817367   2.208380   1.276 0.207820    
## ACO          3.646707   0.956500   3.813 0.000372 ***
## AHS          0.323673   0.412119   0.785 0.435860    
## SI           0.005325   0.009279   0.574 0.568550    
## SRI          0.008135   0.008804   0.924 0.359783    
## UI          -0.036264   0.013330  -2.720 0.008894 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7554 on 51 degrees of freedom
## Multiple R-squared:  0.7042, Adjusted R-squared:  0.6752 
## F-statistic: 24.28 on 5 and 51 DF,  p-value: 2.04e-12
# Residuals
# 2 x 2 grid so that the four default diagnostic plots of an lm object fit on one page
par(mfrow=c(2,2))
# Default plots: Residuals vs Fitted, Normal Q-Q, Scale-Location and Residuals vs Leverage
plot(model)

# Autocorrelation of the residuals (Durbin-Watson test)
# Null hypothesis: no first-order autocorrelation (rho = 0); a D-W statistic close to 2 supports it.
# The p-value is obtained by bootstrap simulation, so it can change slightly between runs.
durbinWatsonTest(model)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.1416308      1.597747   0.062
##  Alternative hypothesis: rho != 0
# Multicollinearity
# Tolerance and Variance Inflation Factor (VIF = 1 / Tolerance) of each independent variable;
# a common rule of thumb treats a VIF above 5 (or 10) as a sign of problematic collinearity
ols_vif_tol(model)
##   Variables Tolerance      VIF
## 1       ACO 0.3528890 2.833752
## 2       AHS 0.3963709 2.522889
## 3        SI 0.7968916 1.254876
## 4       SRI 0.5236950 1.909508
## 5        UI 0.3165801 3.158758
# Eigenvalues and condition indices, with the variance proportion of each coefficient per dimension;
# a common rule of thumb treats a condition index above 30 as a sign of strong collinearity
ols_eigen_cindex(model)
##    Eigenvalue Condition Index    intercept          ACO          AHS
## 1 5.386537577        1.000000 6.994331e-05 0.0005136938 0.0001916512
## 2 0.444466338        3.481252 6.484243e-05 0.0026682253 0.0001278701
## 3 0.084386209        7.989491 5.829055e-05 0.0478676279 0.0091615336
## 4 0.073784878        8.544195 1.355679e-03 0.0031699136 0.0100934045
## 5 0.009322827       24.037043 5.414145e-03 0.7943557055 0.2105218176
## 6 0.001502171       59.881840 9.930371e-01 0.1514248340 0.7699037229
##            SI         SRI           UI
## 1 0.007888051 0.001515333 6.216297e-04
## 2 0.693175876 0.006641788 7.488285e-07
## 3 0.051400736 0.055585833 1.128114e-01
## 4 0.152292605 0.382488929 4.801705e-02
## 5 0.090809203 0.374832118 1.851308e-01
## 6 0.004433528 0.178935999 6.534183e-01
# Combined collinearity report: prints both tables shown above (tolerance/VIF and eigenvalue/condition index)
ols_coll_diag(model)
## Tolerance and Variance Inflation Factor
## ---------------------------------------
##   Variables Tolerance      VIF
## 1       ACO 0.3528890 2.833752
## 2       AHS 0.3963709 2.522889
## 3        SI 0.7968916 1.254876
## 4       SRI 0.5236950 1.909508
## 5        UI 0.3165801 3.158758
## 
## 
## Eigenvalue and Condition Index
## ------------------------------
##    Eigenvalue Condition Index    intercept          ACO          AHS
## 1 5.386537577        1.000000 6.994331e-05 0.0005136938 0.0001916512
## 2 0.444466338        3.481252 6.484243e-05 0.0026682253 0.0001278701
## 3 0.084386209        7.989491 5.829055e-05 0.0478676279 0.0091615336
## 4 0.073784878        8.544195 1.355679e-03 0.0031699136 0.0100934045
## 5 0.009322827       24.037043 5.414145e-03 0.7943557055 0.2105218176
## 6 0.001502171       59.881840 9.930371e-01 0.1514248340 0.7699037229
##            SI         SRI           UI
## 1 0.007888051 0.001515333 6.216297e-04
## 2 0.693175876 0.006641788 7.488285e-07
## 3 0.051400736 0.055585833 1.128114e-01
## 4 0.152292605 0.382488929 4.801705e-02
## 5 0.090809203 0.374832118 1.851308e-01
## 6 0.004433528 0.178935999 6.534183e-01

6 Reference

Filipe Moura, Gabriel Valença, Miguel Costa, Carlos Roque, & Rosa Félix. (2021, March). U-Shift/Transport-Demand-Modelling: Supporting materials to Transportation Demand Modelling classes at Instituto Superior Técnico - University of Lisbon. (Version 2021.0). GitHub. http://doi.org/10.5281/zenodo.4599525