1 Library Loading

library(dplyr)
library(tidyverse)
library(readxl) 
library(tidyverse) 
library(skimr) # Library used for providing a summary of the data
library(DataExplorer) # Library used in data science to perform exploratory data analysis
library(corrplot)

2 Data loading

dataset <- read_excel("TDM_Class3_MLR_Chicago_Example.xls")

3 Exploratory Data Analysis

3.1 Get to know data

str(dataset)

## tibble [57 × 6] (S3: tbl_df/tbl/data.frame)
##  $ TODU: num [1:57] 3.18 3.89 3.98 4.16 3.6 4.1 4.36 4.87 5.85 4.97 ...
##  $ ACO : num [1:57] 0.59 0.57 0.61 0.61 0.63 0.66 0.71 0.77 0.84 0.74 ...
##  $ AHS : num [1:57] 3.26 3.13 3.02 3.14 3.75 3.24 2.77 2.74 3.02 2.84 ...
##  $ SI  : num [1:57] 21 21.6 12.6 17.6 35.3 ...
##  $ SRI : num [1:57] 28.3 20.9 26 28.5 27.2 ...
##  $ UI  : num [1:57] 60.1 65.7 63.2 66.2 58.4 ...

head(dataset,5)

## # A tibble: 5 × 6
##    TODU   ACO   AHS    SI   SRI    UI
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  3.18  0.59  3.26  21.0  28.3  60.1
## 2  3.89  0.57  3.13  21.6  20.9  65.7
## 3  3.98  0.61  3.02  12.6  26.0  63.2
## 4  4.16  0.61  3.14  17.6  28.5  66.2
## 5  3.6   0.63  3.75  35.3  27.2  58.4

typeof(dataset)

## [1] "list"

class(dataset)

## [1] "tbl_df"     "tbl"        "data.frame"

3.2 Transform the dataset into a dataframe

df <- data.frame(dataset)
str(dataset)

## tibble [57 × 6] (S3: tbl_df/tbl/data.frame)
##  $ TODU: num [1:57] 3.18 3.89 3.98 4.16 3.6 4.1 4.36 4.87 5.85 4.97 ...
##  $ ACO : num [1:57] 0.59 0.57 0.61 0.61 0.63 0.66 0.71 0.77 0.84 0.74 ...
##  $ AHS : num [1:57] 3.26 3.13 3.02 3.14 3.75 3.24 2.77 2.74 3.02 2.84 ...
##  $ SI  : num [1:57] 21 21.6 12.6 17.6 35.3 ...
##  $ SRI : num [1:57] 28.3 20.9 26 28.5 27.2 ...
##  $ UI  : num [1:57] 60.1 65.7 63.2 66.2 58.4 ...

str(df)

## 'data.frame':    57 obs. of  6 variables:
##  $ TODU: num  3.18 3.89 3.98 4.16 3.6 4.1 4.36 4.87 5.85 4.97 ...
##  $ ACO : num  0.59 0.57 0.61 0.61 0.63 0.66 0.71 0.77 0.84 0.74 ...
##  $ AHS : num  3.26 3.13 3.02 3.14 3.75 3.24 2.77 2.74 3.02 2.84 ...
##  $ SI  : num  21 21.6 12.6 17.6 35.3 ...
##  $ SRI : num  28.3 20.9 26 28.5 27.2 ...
##  $ UI  : num  60.1 65.7 63.2 66.2 58.4 ...

head(df,10)

##    TODU  ACO  AHS    SI   SRI    UI
## 1  3.18 0.59 3.26 21.01 28.32 60.10
## 2  3.89 0.57 3.13 21.61 20.89 65.71
## 3  3.98 0.61 3.02 12.57 25.99 63.19
## 4  4.16 0.61 3.14 17.61 28.52 66.24
## 5  3.60 0.63 3.75 35.32 27.18 58.36
## 6  4.10 0.66 3.24 14.73 27.95 59.58
## 7  4.36 0.71 2.77 11.61 39.91 64.64
## 8  4.87 0.77 2.74 10.71 48.36 67.88
## 9  5.85 0.84 3.02  8.20 42.15 56.86
## 10 4.97 0.74 2.84  7.94 38.14 62.44

3.3 Show summary statistics

skim(df)

Data summary
Name	df
Number of rows	57
Number of columns	6
_______________________
Column type frequency:
numeric	6
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
TODU	1	5.37	1.33	3.02	4.54	5.10	6.13	9.14	▃▇▅▃▁
ACO	1	0.81	0.18	0.50	0.67	0.79	0.92	1.32	▆▇▇▃▁
AHS	1	3.19	0.39	1.83	3.00	3.19	3.37	4.50	▁▂▇▂▁
SI	1	13.07	12.19	2.17	6.82	9.86	15.08	62.53	▇▂▁▁▁
SRI	1	49.56	15.84	20.89	38.14	49.37	60.85	87.38	▅▆▇▅▂
UI	1	52.62	13.46	24.08	44.80	55.51	61.09	83.66	▃▅▇▅▁

4 Deal with missing data

4.1 Dataset with missing data

table(is.na(df)) # FALSE <- no missing data

## 
## FALSE 
##   342

df_missing <- read_excel("TDM_Class3_MLR_Chicago_Example_md.xls")
df_missing <- data.frame(df_missing)

4.2 How many missing data?

table(is.na(df_missing))

## 
## FALSE  TRUE 
##   337     5

4.3 Plot the percentage of missing data

plot_missing(df_missing)

4.4 Treat missing data

# listwise deletion
df_missingListwise = na.omit(df_missing) #removes all rows with at least one NA in any variable
# Pairwise deletion
df_missingPairwise = df_missing[!is.na(df_missing$ACO),] #removes all rows with NA in ACO variable
# Replace missing value with mean or median 
df_missing$ACO[is.na(df_missing$ACO)] <- mean(df_missing$ACO, na.rm = TRUE)
df_missing$ACO[is.na(df_missing$ACO)] <- median(df_missing$ACO, na.rm = TRUE)

5 Detect outliers

5.1 Detect

df_no_outliers <- df 
boxplot(df_no_outliers) # other method to detect outliers: Cook distance, QQplot

5.2 Take out the outliers from the variable SI

outlier <- function(x){
  quant <- quantile(x, probs=c(0.25,0.75))
  caps <- quantile(x, probs=c(0.05,0.95))
  H <- 1.5*IQR(x, na.rm = TRUE)
  x[x < (quant[1] - H)] <- caps[1]
  x[x > (quant[2] + H)] <- caps[2]
  return(x)
}

df_no_outliers$SI <- outlier(df_no_outliers$SI)

boxplot(df_no_outliers)

5.3 Compare results of dataset with and without outliers

mean(df$SI)

## [1] 13.07316

mean(df_no_outliers$SI)

## [1] 12.14681

median(df$SI)

## [1] 9.86

median(df_no_outliers$SI)

## [1] 9.86

var(df$SI)

## [1] 148.5175

var(df_no_outliers$SI)

## [1] 80.62296

6 Histograms

plot_histogram(df,ncol = 3) #with 3 columns

## Plot boxplots of each independent variable with TODU
plot_boxplot(df, by = "TODU", ncol = 3) # If you increase the average car ownership (ACO) it will tend to increase the number of trips per dwelling unit (TODU). This makes sense

7 Correlations

res <- cor.mtest(df,conf.level = .95)
corrplot(cor(df), p.mat = res$p, method = "number", type = "upper", order="hclust", sig.level = 0.05)

cor.test(df$AHS,df$SI) # check p-value

## 
##  Pearson's product-moment correlation
## 
## data:  df$AHS and df$SI
## t = 0.63199, df = 55, p-value = 0.53
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1796317  0.3379997
## sample estimates:
##        cor 
## 0.08491026

8 Reference

Filipe Moura, Gabriel Valença, Miguel Costa, Carlos Roque, & Rosa Félix. (2021, March). U-Shift/Transport-Demand-Modelling: Supporting materials to Transportation Demand Modelling classes at Instituto Superior Técnico - University of Lisbon. (Version 2021.0). GitHub. http://doi.org/10.5281/zenodo.4599525

Exploratory Data Analysis

Mar 19, 2022