1 Louisiana Crashes

1.1 Hydroplaning

library(textmineR)
library(DT)
setwd("C:/Users/subas/Syncplicity/MyProjects_IMP/MY_Papers_V2/TRB 2020/00000000 FINALz/0005 Hydroplaning")
dat= read.csv("Hydroplane10_16Narr.csv")
names(dat)
## [1] "X"          "CRASH_NUM1" "NARRATIVE"
datatable(
  dat[,c(2, 3)], extensions = c('Select', 'Buttons'), options = list(
    select = list(style = 'os', items = 'row'),
    dom = 'Blfrtip',
    rowId = 0,
    buttons = c('selectRows', 'csv', 'excel')
  ),
  selection = 'none'
)
# create a document term matrix 
dtm <- CreateDtm(doc_vec = dat$NARRATIVE, # character vector of documents
                 doc_names = dat$CRASH_NUM1, # document names
                 ngram_window = c(1, 2), # minimum and maximum n-gram length
                 stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm
                                  stopwords::stopwords(source = "smart")), # this is the default value
                 lower = TRUE, # lowercase - this is the default value
                 remove_punctuation = TRUE, # punctuation - this is the default
                 remove_numbers = TRUE, # numbers - this is the default
                 verbose = FALSE, # Turn off status bar for this demo
                 cpus = 2) # default is all available cpus on the system

# construct the matrix of term counts to get the IDF vector
tf_mat <- TermDocFreq(dtm)


# TF-IDF and cosine similarity
tfidf <- t(dtm[ , tf_mat$term ]) * tf_mat$idf

tfidf <- t(tfidf)


csim <- tfidf / sqrt(rowSums(tfidf * tfidf))
csim <- csim %*% t(csim)
cdist <- as.dist(1 - csim)


hc <- hclust(cdist, "ward.D")

clustering <- cutree(hc, 10)


plot(hc, main = " ",
     ylab = "", xlab = "", yaxt = "n")

rect.hclust(hc, 12, border = "red")

p_words <- colSums(dtm) / sum(dtm)

cluster_words <- lapply(unique(clustering), function(x){
  rows <- dtm[ clustering == x , ]
  
  # for memory's sake, drop all words that don't appear in the cluster
  rows <- rows[ , colSums(rows) > 0 ]
  
  colSums(rows) / sum(rows) - p_words[ colnames(rows) ]
})

1.2 Cluster Summary

# create a summary table of the top 5 words defining each cluster
cluster_summary <- data.frame(cluster = unique(clustering),
                              size = as.numeric(table(clustering)),
                              top_words = sapply(cluster_words, function(d){
                                paste(
                                  names(d)[ order(d, decreasing = TRUE) ][ 1:5 ], 
                                  collapse = ", ")
                              }),
                              stringsAsFactors = FALSE)
cluster_summary
##    cluster size                                                      top_words
## 1        1  402                          vehicle, driver, lane, front, officer
## 2        2  319                               ditch, deputy, roadway, road, mr
## 3        3   32                  driver, driver_stated, stated, drivers, block
## 4        4   27                    veh, driver_veh, veh_stated, oper, oper_veh
## 5        5    5              sherwood, forest, sherwood_forest, stated, driver
## 6        6   26                                due, vehicle, st, time, jackson
## 7        7    4 number, vehicle_number, vehicle, driver_vehicle, number_stated
## 8        8    8              advised, cleared_action, officer, cleared, action
## 9        9   12                  driver_vehicle, vehicle, crash, petty, driver
## 10      10    3        lt_pelletier, pelletier, lt, pelletier_reports, reports