library(dplyr)
library(tidyverse)
library(readxl)
library(tidyverse)
library(skimr) # Library used for providing a summary of the data
library(DataExplorer) # Library used in data science to perform exploratory data analysis
library(corrplot)dataset <- read_excel("TDM_Class3_MLR_Chicago_Example.xls") str(dataset)## tibble [57 × 6] (S3: tbl_df/tbl/data.frame)
## $ TODU: num [1:57] 3.18 3.89 3.98 4.16 3.6 4.1 4.36 4.87 5.85 4.97 ...
## $ ACO : num [1:57] 0.59 0.57 0.61 0.61 0.63 0.66 0.71 0.77 0.84 0.74 ...
## $ AHS : num [1:57] 3.26 3.13 3.02 3.14 3.75 3.24 2.77 2.74 3.02 2.84 ...
## $ SI : num [1:57] 21 21.6 12.6 17.6 35.3 ...
## $ SRI : num [1:57] 28.3 20.9 26 28.5 27.2 ...
## $ UI : num [1:57] 60.1 65.7 63.2 66.2 58.4 ...
head(dataset,5)## # A tibble: 5 × 6
## TODU ACO AHS SI SRI UI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.18 0.59 3.26 21.0 28.3 60.1
## 2 3.89 0.57 3.13 21.6 20.9 65.7
## 3 3.98 0.61 3.02 12.6 26.0 63.2
## 4 4.16 0.61 3.14 17.6 28.5 66.2
## 5 3.6 0.63 3.75 35.3 27.2 58.4
typeof(dataset)## [1] "list"
class(dataset)## [1] "tbl_df" "tbl" "data.frame"
df <- data.frame(dataset)
str(dataset)## tibble [57 × 6] (S3: tbl_df/tbl/data.frame)
## $ TODU: num [1:57] 3.18 3.89 3.98 4.16 3.6 4.1 4.36 4.87 5.85 4.97 ...
## $ ACO : num [1:57] 0.59 0.57 0.61 0.61 0.63 0.66 0.71 0.77 0.84 0.74 ...
## $ AHS : num [1:57] 3.26 3.13 3.02 3.14 3.75 3.24 2.77 2.74 3.02 2.84 ...
## $ SI : num [1:57] 21 21.6 12.6 17.6 35.3 ...
## $ SRI : num [1:57] 28.3 20.9 26 28.5 27.2 ...
## $ UI : num [1:57] 60.1 65.7 63.2 66.2 58.4 ...
str(df)## 'data.frame': 57 obs. of 6 variables:
## $ TODU: num 3.18 3.89 3.98 4.16 3.6 4.1 4.36 4.87 5.85 4.97 ...
## $ ACO : num 0.59 0.57 0.61 0.61 0.63 0.66 0.71 0.77 0.84 0.74 ...
## $ AHS : num 3.26 3.13 3.02 3.14 3.75 3.24 2.77 2.74 3.02 2.84 ...
## $ SI : num 21 21.6 12.6 17.6 35.3 ...
## $ SRI : num 28.3 20.9 26 28.5 27.2 ...
## $ UI : num 60.1 65.7 63.2 66.2 58.4 ...
head(df,10)## TODU ACO AHS SI SRI UI
## 1 3.18 0.59 3.26 21.01 28.32 60.10
## 2 3.89 0.57 3.13 21.61 20.89 65.71
## 3 3.98 0.61 3.02 12.57 25.99 63.19
## 4 4.16 0.61 3.14 17.61 28.52 66.24
## 5 3.60 0.63 3.75 35.32 27.18 58.36
## 6 4.10 0.66 3.24 14.73 27.95 59.58
## 7 4.36 0.71 2.77 11.61 39.91 64.64
## 8 4.87 0.77 2.74 10.71 48.36 67.88
## 9 5.85 0.84 3.02 8.20 42.15 56.86
## 10 4.97 0.74 2.84 7.94 38.14 62.44
skim(df)| Name | df |
| Number of rows | 57 |
| Number of columns | 6 |
| _______________________ | |
| Column type frequency: | |
| numeric | 6 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| TODU | 0 | 1 | 5.37 | 1.33 | 3.02 | 4.54 | 5.10 | 6.13 | 9.14 | ▃▇▅▃▁ |
| ACO | 0 | 1 | 0.81 | 0.18 | 0.50 | 0.67 | 0.79 | 0.92 | 1.32 | ▆▇▇▃▁ |
| AHS | 0 | 1 | 3.19 | 0.39 | 1.83 | 3.00 | 3.19 | 3.37 | 4.50 | ▁▂▇▂▁ |
| SI | 0 | 1 | 13.07 | 12.19 | 2.17 | 6.82 | 9.86 | 15.08 | 62.53 | ▇▂▁▁▁ |
| SRI | 0 | 1 | 49.56 | 15.84 | 20.89 | 38.14 | 49.37 | 60.85 | 87.38 | ▅▆▇▅▂ |
| UI | 0 | 1 | 52.62 | 13.46 | 24.08 | 44.80 | 55.51 | 61.09 | 83.66 | ▃▅▇▅▁ |
table(is.na(df)) # FALSE <- no missing data ##
## FALSE
## 342
df_missing <- read_excel("TDM_Class3_MLR_Chicago_Example_md.xls")
df_missing <- data.frame(df_missing)table(is.na(df_missing))##
## FALSE TRUE
## 337 5
plot_missing(df_missing)# listwise deletion
df_missingListwise = na.omit(df_missing) #removes all rows with at least one NA in any variable
# Pairwise deletion
df_missingPairwise = df_missing[!is.na(df_missing$ACO),] #removes all rows with NA in ACO variable
# Replace missing value with mean or median
df_missing$ACO[is.na(df_missing$ACO)] <- mean(df_missing$ACO, na.rm = TRUE)
df_missing$ACO[is.na(df_missing$ACO)] <- median(df_missing$ACO, na.rm = TRUE)df_no_outliers <- df
boxplot(df_no_outliers) # other method to detect outliers: Cook distance, QQplotoutlier <- function(x){
quant <- quantile(x, probs=c(0.25,0.75))
caps <- quantile(x, probs=c(0.05,0.95))
H <- 1.5*IQR(x, na.rm = TRUE)
x[x < (quant[1] - H)] <- caps[1]
x[x > (quant[2] + H)] <- caps[2]
return(x)
}
df_no_outliers$SI <- outlier(df_no_outliers$SI)
boxplot(df_no_outliers)mean(df$SI)## [1] 13.07316
mean(df_no_outliers$SI)## [1] 12.14681
median(df$SI)## [1] 9.86
median(df_no_outliers$SI)## [1] 9.86
var(df$SI)## [1] 148.5175
var(df_no_outliers$SI)## [1] 80.62296
plot_histogram(df,ncol = 3) #with 3 columns## Plot boxplots of each independent variable with TODU
plot_boxplot(df, by = "TODU", ncol = 3) # If you increase the average car ownership (ACO) it will tend to increase the number of trips per dwelling unit (TODU). This makes senseres <- cor.mtest(df,conf.level = .95)
corrplot(cor(df), p.mat = res$p, method = "number", type = "upper", order="hclust", sig.level = 0.05)cor.test(df$AHS,df$SI) # check p-value##
## Pearson's product-moment correlation
##
## data: df$AHS and df$SI
## t = 0.63199, df = 55, p-value = 0.53
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1796317 0.3379997
## sample estimates:
## cor
## 0.08491026
Filipe Moura, Gabriel Valença, Miguel Costa, Carlos Roque, & Rosa Félix. (2021, March). U-Shift/Transport-Demand-Modelling: Supporting materials to Transportation Demand Modelling classes at Instituto Superior Técnico - University of Lisbon. (Version 2021.0). GitHub. http://doi.org/10.5281/zenodo.4599525