# Crash 01

# Switch to the chapter folder and read the crash dataset from it.
setwd("J:/AIiHS/Chap03")
dat <- read.csv(file = "TAHIR_rwd1.csv")
# Count the rows in each highway class.
table(dat[["HwyClass"]])

## 
##        Rural Interstate   Rural Multi-lane Div. Rural Multi-lane Undiv. 
##                     356                     519                     158 
##          Rural Two-Lane        Urban Interstate   Urban Multi-lane Div. 
##                    8461                     529                     789 
## Urban Multi-lane Undiv.          Urban Two-lane 
##                     739                    2482

# Preview the first six rows (head()'s default) to check the column layout.
head(dat, n = 6L)

##            NewSegID Urban_Rur       HwyClass Length AADT Lanes LaneWidth
## 1        001-01_0_2     Rural Rural Two-Lane  2.000 2920     2        12
## 2 001-01_2.53_4.349     Rural Rural Two-Lane  1.819 2920     2        12
## 3     001-01_2_2.53     Rural Rural Two-Lane  0.530 2920     2        12
## 4  001-01_4.349_6.3     Urban Urban Two-lane  1.951 4320     2        12
## 5    001-01_6.3_8.3     Urban Urban Two-lane  2.000 4320     2        12
## 6  001-01_8.3_9.297     Urban Urban Two-lane  0.997 4320     2        12
##   ShWidth Curve MinPSL Total_Crash KABC_Crash
## 1       6     8     55          12          4
## 2       6     4     45           6          3
## 3       6     4     55           1          1
## 4       6     7     35           9          4
## 5       6     5     45           3          1
## 6       6     2     45           2          2

# Keep only the rural two-lane rows. which() discards rows whose HwyClass is
# NA, which is also what subset() does.
dat1 <- dat[which(dat$HwyClass == "Rural Two-Lane"), ]
dim(dat1)

## [1] 8461   12

# Training share: three quarters of the rows, rounded down.
smp_size <- floor(nrow(dat1) * 0.75)

# Fix the RNG seed so the same rows are drawn on every run.
set.seed(123)
train_ind <- sample.int(nrow(dat1), size = smp_size)

# Drawn rows form the training set; every remaining row goes to the test set.
test <- dat1[-train_ind, ]
train <- dat1[train_ind, ]

dim(train)

## [1] 6345   12

dim(test)

## [1] 2116   12

# Columns used for modelling: six predictors plus the response (Total_Crash).
# They are selected by name rather than by position (previously c(4, 5, 7:11))
# so the selection does not silently shift if the CSV's column order changes;
# a missing column now fails loudly with "undefined columns selected".
model_cols <- c(
  "Length", "AADT", "LaneWidth", "ShWidth", "Curve", "MinPSL", "Total_Crash"
)
train_df <- train[, model_cols]
test_df <- test[, model_cols]



library(keras)
library(tfdatasets)
library(tensorflow)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)


# Build the feature spec step by step:
#   1. declare Total_Crash as the response and every other column as a feature,
#   2. register all numeric columns, normalised with scaler_standard(),
#   3. fit the spec to the training data.
spec <- feature_spec(train_df, Total_Crash ~ .)
spec <- step_numeric_column(
  spec,
  all_numeric(),
  normalizer_fn = scaler_standard()
)
spec <- fit(spec)

spec

## -- Feature Spec ---------------------------------------------------------------- 
## A feature_spec with 6 steps.
## Fitted: TRUE 
## -- Steps ----------------------------------------------------------------------- 
## The feature_spec has 1 dense features.
## StepNumericColumn: Length, AADT, LaneWidth, ShWidth, Curve, MinPSL 
## -- Dense features --------------------------------------------------------------

# Create the Keras inputs from the predictor columns (response dropped first).
input <- layer_input_from_dataset(select(train_df, -Total_Crash))

# Network: dense-features layer built from the fitted spec, two 64-unit ReLU
# hidden layers, and a single linear output unit.
# NOTE(review): the chain is deliberately left as one pipe. The auto-generated
# layer names shown by summary() (dense_2, dense_1, dense) appear to depend on
# the order in which the layers are created -- confirm before unrolling it.
output <- input %>%
  layer_dense_features(dense_features(spec)) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 1)

model <- keras_model(inputs = input, outputs = output)

summary(model)

## Model: "model"
## ________________________________________________________________________________
## Layer (type)              Output Shape      Param #  Connected to               
## ================================================================================
## AADT (InputLayer)         [(None,)]         0                                   
## ________________________________________________________________________________
## Curve (InputLayer)        [(None,)]         0                                   
## ________________________________________________________________________________
## LaneWidth (InputLayer)    [(None,)]         0                                   
## ________________________________________________________________________________
## Length (InputLayer)       [(None,)]         0                                   
## ________________________________________________________________________________
## MinPSL (InputLayer)       [(None,)]         0                                   
## ________________________________________________________________________________
## ShWidth (InputLayer)      [(None,)]         0                                   
## ________________________________________________________________________________
## dense_features (DenseFeat (None, 6)         0        AADT[0][0]                 
##                                                      Curve[0][0]                
##                                                      LaneWidth[0][0]            
##                                                      Length[0][0]               
##                                                      MinPSL[0][0]               
##                                                      ShWidth[0][0]              
## ________________________________________________________________________________
## dense_2 (Dense)           (None, 64)        448      dense_features[0][0]       
## ________________________________________________________________________________
## dense_1 (Dense)           (None, 64)        4160     dense_2[0][0]              
## ________________________________________________________________________________
## dense (Dense)             (None, 1)         65       dense_1[0][0]              
## ================================================================================
## Total params: 4,673
## Trainable params: 4,673
## Non-trainable params: 0
## ________________________________________________________________________________

# Configure training: mean-squared-error loss, RMSprop optimiser, and mean
# absolute error reported as a metric. compile() modifies `model` in place.
# Note: `model` is reassigned by `model <- build_model()` further down, before
# any call to fit().
compile(
  model,
  loss = "mse",
  optimizer = optimizer_rmsprop(),
  metrics = list("mean_absolute_error")
)


# Construct and compile a new instance of the regression network.
#
# Takes no arguments; it reads `train_df` (for the input columns) and the
# fitted `spec` (for the dense-features layer) from the enclosing environment.
# Returns the compiled Keras model.
build_model <- function() {
  predictors <- select(train_df, -Total_Crash)
  net_input <- layer_input_from_dataset(predictors)

  # Kept as a single pipe so the layers are created exactly as before.
  net_output <- net_input %>%
    layer_dense_features(dense_features(spec)) %>%
    layer_dense(units = 64, activation = "relu") %>%
    layer_dense(units = 64, activation = "relu") %>%
    layer_dense(units = 1)

  net <- keras_model(net_input, net_output)

  # compile() configures the model in place: MSE loss, RMSprop, MAE metric.
  compile(
    net,
    loss = "mse",
    optimizer = optimizer_rmsprop(),
    metrics = list("mean_absolute_error")
  )

  net
}

# Progress indicator for training with verbose = 0: emit one dot at the end of
# every epoch, preceded by a line break whenever the epoch index is a multiple
# of 80, so that each printed row holds at most 80 dots.
print_dot_callback <- callback_lambda(
  on_epoch_end = function(epoch, logs) {
    starts_new_row <- epoch %% 80 == 0
    cat(if (starts_new_row) "\n." else ".")
  }
)

# Start from a freshly built model, then train it for 200 epochs with 20% of
# the training rows held out for validation. Keras' own progress output is
# switched off (verbose = 0); the dot callback reports progress instead.
model <- build_model()

history <- fit(
  model,
  x = select(train_df, -Total_Crash),
  y = train_df[["Total_Crash"]],
  epochs = 200,
  validation_split = 0.2,
  verbose = 0,
  callbacks = list(print_dot_callback)
)

## 
## ................................................................................
## ................................................................................
## ........................................

library(ggplot2)
# Plot the per-epoch training history with a black-and-white theme, 16 pt base.
plot(history) + theme_bw(base_size = 16)

## `geom_smooth()` using formula 'y ~ x'

# Predict crash counts for the held-out rows and store them next to the
# observed counts. predict() returns a one-column matrix, hence the [, 1].
test_predictions <- model %>% predict(test_df %>% select(-Total_Crash))
test_df$pred <- test_predictions[, 1]


# Observed total crashes in the test set.
sum(test_df$Total_Crash)

## [1] 6020

# Predicted total crashes in the test set, to compare with the observed total.
# (This line previously repeated sum(test_df$Total_Crash), so the predictions
# were never summarised.)
sum(test_df$pred)

## (output not recorded -- re-run the script to obtain the predicted total)

# Predict crash counts for the training rows and store them next to the
# observed counts. predict() returns a one-column matrix, hence the [, 1].
train_predictions <- model %>% predict(train_df %>% select(-Total_Crash))
train_df$pred <- train_predictions[, 1]


# Observed total crashes in the training set.
sum(train_df$Total_Crash)

## [1] 17615

# Predicted total crashes in the training set, to compare with the observed
# total. (This line previously repeated sum(train_df$Total_Crash), so the
# predictions were never summarised.)
sum(train_df$pred)

## (output not recorded -- re-run the script to obtain the predicted total)