1 Library Loading

library(dplyr)
library(tidyverse)
library(readxl) 
library(tidyverse) 
library(skimr) # Library used for providing a summary of the data
library(DataExplorer) # Library used in data science to perform exploratory data analysis
library(corrplot)

2 Data loading

dataset <- read_excel("TDM_Class3_MLR_Chicago_Example.xls") 

3 Exploratory Data Analysis

3.1 Get to know data

str(dataset)
## tibble [57 × 6] (S3: tbl_df/tbl/data.frame)
##  $ TODU: num [1:57] 3.18 3.89 3.98 4.16 3.6 4.1 4.36 4.87 5.85 4.97 ...
##  $ ACO : num [1:57] 0.59 0.57 0.61 0.61 0.63 0.66 0.71 0.77 0.84 0.74 ...
##  $ AHS : num [1:57] 3.26 3.13 3.02 3.14 3.75 3.24 2.77 2.74 3.02 2.84 ...
##  $ SI  : num [1:57] 21 21.6 12.6 17.6 35.3 ...
##  $ SRI : num [1:57] 28.3 20.9 26 28.5 27.2 ...
##  $ UI  : num [1:57] 60.1 65.7 63.2 66.2 58.4 ...
head(dataset,5)
## # A tibble: 5 × 6
##    TODU   ACO   AHS    SI   SRI    UI
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  3.18  0.59  3.26  21.0  28.3  60.1
## 2  3.89  0.57  3.13  21.6  20.9  65.7
## 3  3.98  0.61  3.02  12.6  26.0  63.2
## 4  4.16  0.61  3.14  17.6  28.5  66.2
## 5  3.6   0.63  3.75  35.3  27.2  58.4
typeof(dataset)
## [1] "list"
class(dataset)
## [1] "tbl_df"     "tbl"        "data.frame"

3.2 Transform the dataset into a dataframe

df <- data.frame(dataset)
str(dataset)
## tibble [57 × 6] (S3: tbl_df/tbl/data.frame)
##  $ TODU: num [1:57] 3.18 3.89 3.98 4.16 3.6 4.1 4.36 4.87 5.85 4.97 ...
##  $ ACO : num [1:57] 0.59 0.57 0.61 0.61 0.63 0.66 0.71 0.77 0.84 0.74 ...
##  $ AHS : num [1:57] 3.26 3.13 3.02 3.14 3.75 3.24 2.77 2.74 3.02 2.84 ...
##  $ SI  : num [1:57] 21 21.6 12.6 17.6 35.3 ...
##  $ SRI : num [1:57] 28.3 20.9 26 28.5 27.2 ...
##  $ UI  : num [1:57] 60.1 65.7 63.2 66.2 58.4 ...
str(df)
## 'data.frame':    57 obs. of  6 variables:
##  $ TODU: num  3.18 3.89 3.98 4.16 3.6 4.1 4.36 4.87 5.85 4.97 ...
##  $ ACO : num  0.59 0.57 0.61 0.61 0.63 0.66 0.71 0.77 0.84 0.74 ...
##  $ AHS : num  3.26 3.13 3.02 3.14 3.75 3.24 2.77 2.74 3.02 2.84 ...
##  $ SI  : num  21 21.6 12.6 17.6 35.3 ...
##  $ SRI : num  28.3 20.9 26 28.5 27.2 ...
##  $ UI  : num  60.1 65.7 63.2 66.2 58.4 ...
head(df,10)
##    TODU  ACO  AHS    SI   SRI    UI
## 1  3.18 0.59 3.26 21.01 28.32 60.10
## 2  3.89 0.57 3.13 21.61 20.89 65.71
## 3  3.98 0.61 3.02 12.57 25.99 63.19
## 4  4.16 0.61 3.14 17.61 28.52 66.24
## 5  3.60 0.63 3.75 35.32 27.18 58.36
## 6  4.10 0.66 3.24 14.73 27.95 59.58
## 7  4.36 0.71 2.77 11.61 39.91 64.64
## 8  4.87 0.77 2.74 10.71 48.36 67.88
## 9  5.85 0.84 3.02  8.20 42.15 56.86
## 10 4.97 0.74 2.84  7.94 38.14 62.44

3.3 Show summary statistics

skim(df)
Data summary
Name df
Number of rows 57
Number of columns 6
_______________________
Column type frequency:
numeric 6
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
TODU 0 1 5.37 1.33 3.02 4.54 5.10 6.13 9.14 ▃▇▅▃▁
ACO 0 1 0.81 0.18 0.50 0.67 0.79 0.92 1.32 ▆▇▇▃▁
AHS 0 1 3.19 0.39 1.83 3.00 3.19 3.37 4.50 ▁▂▇▂▁
SI 0 1 13.07 12.19 2.17 6.82 9.86 15.08 62.53 ▇▂▁▁▁
SRI 0 1 49.56 15.84 20.89 38.14 49.37 60.85 87.38 ▅▆▇▅▂
UI 0 1 52.62 13.46 24.08 44.80 55.51 61.09 83.66 ▃▅▇▅▁

4 Deal with missing data

4.1 Dataset with missing data

table(is.na(df)) # FALSE <- no missing data 
## 
## FALSE 
##   342
df_missing <- read_excel("TDM_Class3_MLR_Chicago_Example_md.xls")
df_missing <- data.frame(df_missing)

4.2 How many missing data?

table(is.na(df_missing))
## 
## FALSE  TRUE 
##   337     5

4.3 Plot the percentage of missing data

plot_missing(df_missing)

4.4 Treat missing data

# listwise deletion
df_missingListwise = na.omit(df_missing) #removes all rows with at least one NA in any variable
# Pairwise deletion
df_missingPairwise = df_missing[!is.na(df_missing$ACO),] #removes all rows with NA in ACO variable
# Replace missing value with mean or median 
df_missing$ACO[is.na(df_missing$ACO)] <- mean(df_missing$ACO, na.rm = TRUE)
df_missing$ACO[is.na(df_missing$ACO)] <- median(df_missing$ACO, na.rm = TRUE)

5 Detect outliers

5.1 Detect

df_no_outliers <- df 
boxplot(df_no_outliers) # other method to detect outliers: Cook distance, QQplot

5.2 Take out the outliers from the variable SI

outlier <- function(x){
  quant <- quantile(x, probs=c(0.25,0.75))
  caps <- quantile(x, probs=c(0.05,0.95))
  H <- 1.5*IQR(x, na.rm = TRUE)
  x[x < (quant[1] - H)] <- caps[1]
  x[x > (quant[2] + H)] <- caps[2]
  return(x)
}

df_no_outliers$SI <- outlier(df_no_outliers$SI)

boxplot(df_no_outliers)

5.3 Compare results of dataset with and without outliers

mean(df$SI)
## [1] 13.07316
mean(df_no_outliers$SI)
## [1] 12.14681
median(df$SI)
## [1] 9.86
median(df_no_outliers$SI)
## [1] 9.86
var(df$SI)
## [1] 148.5175
var(df_no_outliers$SI)
## [1] 80.62296

6 Histograms

plot_histogram(df,ncol = 3) #with 3 columns

## Plot boxplots of each independent variable with TODU
plot_boxplot(df, by = "TODU", ncol = 3) # If you increase the average car ownership (ACO) it will tend to increase the number of trips per dwelling unit (TODU). This makes sense

7 Correlations

res <- cor.mtest(df,conf.level = .95)
corrplot(cor(df), p.mat = res$p, method = "number", type = "upper", order="hclust", sig.level = 0.05)

cor.test(df$AHS,df$SI) # check p-value
## 
##  Pearson's product-moment correlation
## 
## data:  df$AHS and df$SI
## t = 0.63199, df = 55, p-value = 0.53
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1796317  0.3379997
## sample estimates:
##        cor 
## 0.08491026

8 Reference

Filipe Moura, Gabriel Valença, Miguel Costa, Carlos Roque, & Rosa Félix. (2021, March). U-Shift/Transport-Demand-Modelling: Supporting materials to Transportation Demand Modelling classes at Instituto Superior Técnico - University of Lisbon. (Version 2021.0). GitHub. http://doi.org/10.5281/zenodo.4599525