Hydroplaning
library(textmineR)
library(DT)
setwd("C:/Users/subas/Syncplicity/MyProjects_IMP/MY_Papers_V2/TRB 2020/00000000 FINALz/0005 Hydroplaning")
dat= read.csv("Hydroplane10_16Narr.csv")
names(dat)
## [1] "X" "CRASH_NUM1" "NARRATIVE"
datatable(
dat[,c(2, 3)], extensions = c('Select', 'Buttons'), options = list(
select = list(style = 'os', items = 'row'),
dom = 'Blfrtip',
rowId = 0,
buttons = c('selectRows', 'csv', 'excel')
),
selection = 'none'
)
# create a document term matrix
dtm <- CreateDtm(doc_vec = dat$NARRATIVE, # character vector of documents
doc_names = dat$CRASH_NUM1, # document names
ngram_window = c(1, 2), # minimum and maximum n-gram length
stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm
stopwords::stopwords(source = "smart")), # this is the default value
lower = TRUE, # lowercase - this is the default value
remove_punctuation = TRUE, # punctuation - this is the default
remove_numbers = TRUE, # numbers - this is the default
verbose = FALSE, # Turn off status bar for this demo
cpus = 2) # default is all available cpus on the system
# construct the matrix of term counts to get the IDF vector
tf_mat <- TermDocFreq(dtm)
# TF-IDF and cosine similarity
tfidf <- t(dtm[ , tf_mat$term ]) * tf_mat$idf
tfidf <- t(tfidf)
csim <- tfidf / sqrt(rowSums(tfidf * tfidf))
csim <- csim %*% t(csim)
cdist <- as.dist(1 - csim)
hc <- hclust(cdist, "ward.D")
clustering <- cutree(hc, 10)
plot(hc, main = " ",
ylab = "", xlab = "", yaxt = "n")
rect.hclust(hc, 12, border = "red")

p_words <- colSums(dtm) / sum(dtm)
cluster_words <- lapply(unique(clustering), function(x){
rows <- dtm[ clustering == x , ]
# for memory's sake, drop all words that don't appear in the cluster
rows <- rows[ , colSums(rows) > 0 ]
colSums(rows) / sum(rows) - p_words[ colnames(rows) ]
})
Cluster Summary
# create a summary table of the top 5 words defining each cluster
cluster_summary <- data.frame(cluster = unique(clustering),
size = as.numeric(table(clustering)),
top_words = sapply(cluster_words, function(d){
paste(
names(d)[ order(d, decreasing = TRUE) ][ 1:5 ],
collapse = ", ")
}),
stringsAsFactors = FALSE)
cluster_summary
## cluster size top_words
## 1 1 402 vehicle, driver, lane, front, officer
## 2 2 319 ditch, deputy, roadway, road, mr
## 3 3 32 driver, driver_stated, stated, drivers, block
## 4 4 27 veh, driver_veh, veh_stated, oper, oper_veh
## 5 5 5 sherwood, forest, sherwood_forest, stated, driver
## 6 6 26 due, vehicle, st, time, jackson
## 7 7 4 number, vehicle_number, vehicle, driver_vehicle, number_stated
## 8 8 8 advised, cleared_action, officer, cleared, action
## 9 9 12 driver_vehicle, vehicle, crash, petty, driver
## 10 10 3 lt_pelletier, pelletier, lt, pelletier_reports, reports