####################
setwd("J:/AIiHS/Chap03")
dat= read.csv("TAHIR_rwd1.csv")
table(dat$HwyClass)
##
## Rural Interstate Rural Multi-lane Div. Rural Multi-lane Undiv.
## 356 519 158
## Rural Two-Lane Urban Interstate Urban Multi-lane Div.
## 8461 529 789
## Urban Multi-lane Undiv. Urban Two-lane
## 739 2482
## NewSegID Urban_Rur HwyClass Length AADT Lanes LaneWidth
## 1 001-01_0_2 Rural Rural Two-Lane 2.000 2920 2 12
## 2 001-01_2.53_4.349 Rural Rural Two-Lane 1.819 2920 2 12
## 3 001-01_2_2.53 Rural Rural Two-Lane 0.530 2920 2 12
## 4 001-01_4.349_6.3 Urban Urban Two-lane 1.951 4320 2 12
## 5 001-01_6.3_8.3 Urban Urban Two-lane 2.000 4320 2 12
## 6 001-01_8.3_9.297 Urban Urban Two-lane 0.997 4320 2 12
## ShWidth Curve MinPSL Total_Crash KABC_Crash
## 1 6 8 55 12 4
## 2 6 4 45 6 3
## 3 6 4 55 1 1
## 4 6 7 35 9 4
## 5 6 5 45 3 1
## 6 6 2 45 2 2
dat1= subset(dat, HwyClass=="Rural Two-Lane")
dim(dat1)
## [1] 8461 12
dat1= dat1[, c(4, 5, 7:11)]
# Set the graphical theme
ggplot2::theme_set(ggplot2::theme_light())
# Set global knitr chunk options
knitr::opts_chunk$set(
fig.align = "center",
fig.height = 3.5
)
# Helper packages
library(dplyr) # for data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # for awesome graphics
# Modeling packages
library(caret) # for cross-validation, etc.
## Loading required package: lattice
# Model interpretability packages
library(vip) # variable importance
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(rsample)
# stratified sampling with the rsample package
split <- initial_split(dat1, prop = 0.7, strata = "Total_Crash")
ames_train <- training(split)
ames_test <- testing(split)
model1 <- lm(Total_Crash ~ AADT, data = ames_train)
# Fitted regression line (full training data)
p1 <- model1 %>%
broom::augment() %>%
ggplot(aes(Total_Crash, AADT)) +
geom_point(size = 1, alpha = 0.3) +
geom_smooth(se = FALSE, method = "lm") +
ggtitle("Fitted regression line")
# Fitted regression line (restricted range)
p2 <- model1 %>%
broom::augment() %>%
ggplot(aes(Total_Crash, AADT)) +
geom_segment(aes(x = AADT, y =Total_Crash,
xend = AADT, yend = .fitted),
alpha = 0.3) +
geom_point(size = 1, alpha = 0.3) +
geom_smooth(se = FALSE, method = "lm") +
ggtitle("Fitted regression line (with residuals)")
# Side-by-side plots
grid.arrange(p1, p2, nrow = 1)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

##
## Call:
## lm(formula = Total_Crash ~ AADT, data = ames_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.047 -2.089 -1.079 0.937 36.154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.753e+00 6.994e-02 25.06 <2e-16 ***
## AADT 4.984e-04 2.303e-05 21.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.889 on 5922 degrees of freedom
## Multiple R-squared: 0.07328, Adjusted R-squared: 0.07312
## F-statistic: 468.2 on 1 and 5922 DF, p-value: < 2.2e-16
(model2 <- lm(Total_Crash ~ Length+ AADT+LaneWidth+ShWidth+Curve+MinPSL,
data = ames_train))
##
## Call:
## lm(formula = Total_Crash ~ Length + AADT + LaneWidth + ShWidth +
## Curve + MinPSL, data = ames_train)
##
## Coefficients:
## (Intercept) Length AADT LaneWidth ShWidth Curve
## -3.5632504 1.9884062 0.0006287 0.2427817 -0.1551573 0.0698271
## MinPSL
## 0.0010470
# Fitted models
fit1 <- lm(Total_Crash ~ Length+ AADT,
data = ames_train)
fit2 <- lm(Total_Crash ~ Length*AADT, data = ames_train)
# Regression plane data
plot_grid <- expand.grid(
Length = seq(from = min(ames_train$Length),
to = max(ames_train$Length),
length = 100),
AADT = seq(from = min(ames_train$AADT),
to = max(ames_train$AADT),
length = 100)
)
plot_grid$y1 <- predict(fit1, newdata = plot_grid)
plot_grid$y2 <- predict(fit2, newdata = plot_grid)
# Level plots
p1 <- ggplot(plot_grid, aes(x = Length, y =AADT,
z = y1, fill = y1)) +
geom_tile() +
geom_contour(color = "white") +
viridis::scale_fill_viridis(name = "Predicted\nvalue", option = "inferno") +
theme_bw() +
ggtitle("Main effects only")
p2 <- ggplot(plot_grid, aes(x = Length, y = AADT,
z = y2, fill = y1)) +
geom_tile() +
geom_contour(color = "white") +
viridis::scale_fill_viridis(name = "Predicted\nvalue", option = "inferno") +
theme_bw() +
ggtitle("Main effects with two-way interaction")
gridExtra::grid.arrange(p1, p2, nrow = 1)

model3 <- lm(Total_Crash ~ ., data = ames_train)
# print estimated coefficients in a tidy data frame
broom::tidy(model3)
## # A tibble: 7 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -3.56 0.517 -6.89 6.26e- 12
## 2 Length 1.99 0.0732 27.2 2.89e-153
## 3 AADT 0.000629 0.0000243 25.9 2.18e-140
## 4 LaneWidth 0.243 0.0478 5.08 3.91e- 7
## 5 ShWidth -0.155 0.0187 -8.31 1.20e- 16
## 6 Curve 0.0698 0.00951 7.34 2.34e- 13
## 7 MinPSL 0.00105 0.00240 0.435 6.63e- 1
set.seed(123) # for reproducibility
(cv_model1 <- train(
form = Total_Crash ~ AADT,
data = ames_train,
method = "lm",
trControl = trainControl(method = "cv", number = 10)
))
## Linear Regression
##
## 5924 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 5332, 5331, 5332, 5332, 5332, 5332, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 3.875347 0.07673183 2.577924
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# model 2 CV
set.seed(123)
cv_model2 <- train(
Total_Crash ~ AADT+ Length,
data = ames_train,
method = "lm",
trControl = trainControl(method = "cv", number = 10)
)
# model 3 CV
set.seed(123)
cv_model3 <- train(
Total_Crash ~ .,
data = ames_train,
method = "lm",
trControl = trainControl(method = "cv", number = 10)
)
# Extract out of sample performance measures
summary(resamples(list(
model1 = cv_model1,
model2 = cv_model2,
model3 = cv_model3
)))
##
## Call:
## summary.resamples(object = resamples(list(model1 = cv_model1, model2
## = cv_model2, model3 = cv_model3)))
##
## Models: model1, model2, model3
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## model1 2.405090 2.532822 2.581976 2.577924 2.627131 2.740485 0
## model2 2.177279 2.220739 2.261894 2.280169 2.322081 2.450890 0
## model3 2.158196 2.226499 2.271603 2.272069 2.296491 2.429654 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## model1 3.479928 3.639352 3.773912 3.875347 3.995628 4.66892 0
## model2 3.191103 3.317634 3.395189 3.549546 3.652029 4.32677 0
## model3 3.150689 3.274881 3.399690 3.510730 3.589900 4.27591 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## model1 0.03450388 0.06544595 0.0755200 0.07673183 0.09313064 0.1068552 0
## model2 0.16649632 0.21674539 0.2297650 0.22473498 0.24671061 0.2499206 0
## model3 0.20418271 0.22152070 0.2425712 0.24218291 0.26640708 0.2766963 0