Lin Reg

####################

setwd("J:/AIiHS/Chap03")
dat= read.csv("TAHIR_rwd1.csv")
table(dat$HwyClass)

## 
##        Rural Interstate   Rural Multi-lane Div. Rural Multi-lane Undiv. 
##                     356                     519                     158 
##          Rural Two-Lane        Urban Interstate   Urban Multi-lane Div. 
##                    8461                     529                     789 
## Urban Multi-lane Undiv.          Urban Two-lane 
##                     739                    2482

head(dat)

##            NewSegID Urban_Rur       HwyClass Length AADT Lanes LaneWidth
## 1        001-01_0_2     Rural Rural Two-Lane  2.000 2920     2        12
## 2 001-01_2.53_4.349     Rural Rural Two-Lane  1.819 2920     2        12
## 3     001-01_2_2.53     Rural Rural Two-Lane  0.530 2920     2        12
## 4  001-01_4.349_6.3     Urban Urban Two-lane  1.951 4320     2        12
## 5    001-01_6.3_8.3     Urban Urban Two-lane  2.000 4320     2        12
## 6  001-01_8.3_9.297     Urban Urban Two-lane  0.997 4320     2        12
##   ShWidth Curve MinPSL Total_Crash KABC_Crash
## 1       6     8     55          12          4
## 2       6     4     45           6          3
## 3       6     4     55           1          1
## 4       6     7     35           9          4
## 5       6     5     45           3          1
## 6       6     2     45           2          2

dat1= subset(dat, HwyClass=="Rural Two-Lane")
dim(dat1)

## [1] 8461   12

dat1= dat1[, c(4, 5, 7:11)]

# Set the graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# Set global knitr chunk options
knitr::opts_chunk$set(
  fig.align = "center",
  fig.height = 3.5
)

# Helper packages
library(dplyr)    # for data manipulation

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)  # for awesome graphics

# Modeling packages
library(caret)    # for cross-validation, etc.

## Loading required package: lattice

# Model interpretability packages
library(vip)      # variable importance

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

library(rsample)
# stratified sampling with the rsample package
split  <- initial_split(dat1, prop = 0.7, strata = "Total_Crash")
ames_train  <- training(split)
ames_test   <- testing(split)

model1 <- lm(Total_Crash ~ AADT, data = ames_train)


# Fitted regression line (full training data)
p1 <- model1 %>%
  broom::augment() %>%
  ggplot(aes(Total_Crash,  AADT)) + 
  geom_point(size = 1, alpha = 0.3) +
  geom_smooth(se = FALSE, method = "lm") +
  ggtitle("Fitted regression line")

# Fitted regression line (restricted range)
p2 <- model1 %>%
  broom::augment() %>%
  ggplot(aes(Total_Crash,  AADT)) + 
  geom_segment(aes(x =  AADT, y =Total_Crash,
                   xend =  AADT, yend = .fitted), 
               alpha = 0.3) +
  geom_point(size = 1, alpha = 0.3) +
  geom_smooth(se = FALSE, method = "lm") +
  ggtitle("Fitted regression line (with residuals)")

# Side-by-side plots
grid.arrange(p1, p2, nrow = 1)

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

summary(model1)

## 
## Call:
## lm(formula = Total_Crash ~ AADT, data = ames_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.047  -2.089  -1.079   0.937  36.154 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.753e+00  6.994e-02   25.06   <2e-16 ***
## AADT        4.984e-04  2.303e-05   21.64   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.889 on 5922 degrees of freedom
## Multiple R-squared:  0.07328,    Adjusted R-squared:  0.07312 
## F-statistic: 468.2 on 1 and 5922 DF,  p-value: < 2.2e-16

(model2 <- lm(Total_Crash ~ Length+ AADT+LaneWidth+ShWidth+Curve+MinPSL, 
              data = ames_train))

## 
## Call:
## lm(formula = Total_Crash ~ Length + AADT + LaneWidth + ShWidth + 
##     Curve + MinPSL, data = ames_train)
## 
## Coefficients:
## (Intercept)       Length         AADT    LaneWidth      ShWidth        Curve  
##  -3.5632504    1.9884062    0.0006287    0.2427817   -0.1551573    0.0698271  
##      MinPSL  
##   0.0010470

# Fitted models
fit1 <- lm(Total_Crash ~ Length+ AADT, 
           data = ames_train)
fit2 <- lm(Total_Crash ~ Length*AADT, data = ames_train)

# Regression plane data
plot_grid <- expand.grid(
  Length = seq(from = min(ames_train$Length), 
                    to = max(ames_train$Length), 
                    length = 100), 
  AADT = seq(from = min(ames_train$AADT), 
                   to = max(ames_train$AADT), 
                   length = 100)
)
plot_grid$y1 <- predict(fit1, newdata = plot_grid)
plot_grid$y2 <- predict(fit2, newdata = plot_grid)

# Level plots
p1 <- ggplot(plot_grid, aes(x = Length, y =AADT, 
                            z = y1, fill = y1)) +
  geom_tile() +
  geom_contour(color = "white") +
  viridis::scale_fill_viridis(name = "Predicted\nvalue", option = "inferno") +
  theme_bw() +
  ggtitle("Main effects only")
p2 <- ggplot(plot_grid, aes(x = Length, y = AADT, 
                            z = y2, fill = y1)) +
  geom_tile() +
  geom_contour(color = "white") +
  viridis::scale_fill_viridis(name = "Predicted\nvalue", option = "inferno") +
  theme_bw() +
  ggtitle("Main effects with two-way interaction")

gridExtra::grid.arrange(p1, p2, nrow = 1)

model3 <- lm(Total_Crash ~ ., data = ames_train) 

# print estimated coefficients in a tidy data frame
broom::tidy(model3)

## # A tibble: 7 x 5
##   term         estimate std.error statistic   p.value
##   <chr>           <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept) -3.56     0.517        -6.89  6.26e- 12
## 2 Length       1.99     0.0732       27.2   2.89e-153
## 3 AADT         0.000629 0.0000243    25.9   2.18e-140
## 4 LaneWidth    0.243    0.0478        5.08  3.91e-  7
## 5 ShWidth     -0.155    0.0187       -8.31  1.20e- 16
## 6 Curve        0.0698   0.00951       7.34  2.34e- 13
## 7 MinPSL       0.00105  0.00240       0.435 6.63e-  1

set.seed(123)  # for reproducibility
(cv_model1 <- train(
  form = Total_Crash ~ AADT, 
  data = ames_train, 
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
))

## Linear Regression 
## 
## 5924 samples
##    1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 5332, 5331, 5332, 5332, 5332, 5332, ... 
## Resampling results:
## 
##   RMSE      Rsquared    MAE     
##   3.875347  0.07673183  2.577924
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

# model 2 CV
set.seed(123)
cv_model2 <- train(
  Total_Crash ~ AADT+ Length, 
  data = ames_train, 
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)

# model 3 CV
set.seed(123)
cv_model3 <- train(
  Total_Crash ~ ., 
  data = ames_train, 
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)

# Extract out of sample performance measures
summary(resamples(list(
  model1 = cv_model1, 
  model2 = cv_model2, 
  model3 = cv_model3
)))

## 
## Call:
## summary.resamples(object = resamples(list(model1 = cv_model1, model2
##  = cv_model2, model3 = cv_model3)))
## 
## Models: model1, model2, model3 
## Number of resamples: 10 
## 
## MAE 
##            Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## model1 2.405090 2.532822 2.581976 2.577924 2.627131 2.740485    0
## model2 2.177279 2.220739 2.261894 2.280169 2.322081 2.450890    0
## model3 2.158196 2.226499 2.271603 2.272069 2.296491 2.429654    0
## 
## RMSE 
##            Min.  1st Qu.   Median     Mean  3rd Qu.    Max. NA's
## model1 3.479928 3.639352 3.773912 3.875347 3.995628 4.66892    0
## model2 3.191103 3.317634 3.395189 3.549546 3.652029 4.32677    0
## model3 3.150689 3.274881 3.399690 3.510730 3.589900 4.27591    0
## 
## Rsquared 
##              Min.    1st Qu.    Median       Mean    3rd Qu.      Max. NA's
## model1 0.03450388 0.06544595 0.0755200 0.07673183 0.09313064 0.1068552    0
## model2 0.16649632 0.21674539 0.2297650 0.22473498 0.24671061 0.2499206    0
## model3 0.20418271 0.22152070 0.2425712 0.24218291 0.26640708 0.2766963    0