The exciting and convenient text mining package: tm

The Bicycle Collision database contains all bicycle collisions recorded by the Boston Police Department in the city of Boston from 2009 to 2012. I decided to use it to explore the tm package: my goal is to find the most used words in the collision narratives and to build a word cloud.
Along the way, I present methods for data import, corpus handling, preprocessing, and the creation of term-document matrices. The documentation can be found here: https://www.rdocumentation.org/packages/tm

library(tm) # load the packages
library(knitr)
library(tidytext)
library(tidyverse)
library(wordcloud)
getSources() # check which source types tm supports
## [1] "DataframeSource" "DirSource"       "URISource"       "VectorSource"   
## [5] "XMLSource"       "ZipSource"
# VectorSource: a vector of characters (treats each element as a document)
# DataframeSource: a data frame containing the text (e.g., read in from a CSV file)
# DirSource: for use with file directories
bike_dt <- read.csv("D:\\School\\Semester 8\\New courses\\New folder\\bike\\Final Bike Collision Database.csv")
bike_dt$Narrative <- as.character(bike_dt$Narrative)

# Create a corpus
my_corpus <- VCorpus(VectorSource(bike_dt$Narrative))
my_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3454
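The same corpus could also have been built with DataframeSource, since the narratives already sit in a data frame. A minimal sketch, assuming a tm version (>= 0.7) whose DataframeSource requires columns named doc_id and text; bike_df and df_corpus are illustrative names:

# Alternative: build the corpus straight from a data frame.
# DataframeSource expects a doc_id column and a text column.
bike_df <- data.frame(doc_id = seq_len(nrow(bike_dt)),
                      text   = bike_dt$Narrative,
                      stringsAsFactors = FALSE)
df_corpus <- VCorpus(DataframeSource(bike_df))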
writeLines(as.character(my_corpus[[3454]]))  # print the text of the last document
# Create a function called "addspace" that finds a user-specified pattern and substitutes a space for it.
addspace <- content_transformer(function(x, pattern) {
  return(gsub(pattern, " ", x))
  })

# tm applies functions to corpora through a dedicated interface called tm_map()
# Replace "-" with space
my_corpus <- tm_map(my_corpus, addspace, "-")
writeLines(as.character(my_corpus[[3454]]))
# remove the remaining punctuation, remove numbers, strip English stop words, convert to lower case, and collapse extra whitespace
my_corpus <- tm_map(my_corpus, removePunctuation)
my_corpus <- tm_map(my_corpus, removeNumbers)
my_corpus <- tm_map(my_corpus, removeWords, stopwords("english"))

# Transform to lower case (need to wrap in content_transformer)
my_corpus <- tm_map(my_corpus, content_transformer(tolower))
my_corpus <- tm_map(my_corpus, stripWhitespace)

writeLines(as.character(my_corpus[[3454]])) # print the same document again to check the cleaning
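A caveat about the order of these steps: removeWords() is case-sensitive, so removing stopwords("english") before lower-casing leaves capitalized tokens like "The" untouched, and tolower() then turns them into "the" (which is why "the" tops the frequency table below and has to be dropped by hand later on). A sketch of the safer ordering, left commented out so the results shown below still reflect the pipeline above:

# Sketch: lower-case first, then remove stop words
# my_corpus <- tm_map(my_corpus, content_transformer(tolower))
# my_corpus <- tm_map(my_corpus, removeWords, stopwords("english"))
# my_corpus <- tm_map(my_corpus, stripWhitespace)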
# coerce my_corpus into a document-term matrix
dtm_bike <- DocumentTermMatrix(my_corpus)


# inspect the first five documents
inspect(dtm_bike[1:5,]) 
## <<DocumentTermMatrix (documents: 5, terms: 3285)>>
## Non-/sparse entries: 399/16026
## Sparsity           : 98%
## Maximal term length: 51
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs and officer stated the was xxx xxxx xxxxx xxxxxx xxxxxxx
##    1   0       3      2   0   0   0    4     6      3       5
##    2  11       3      5  16   6   4   10     5      5       3
##    3   6       4      4  11   6   1    1     8      3       5
##    4   0       2      3   0   0   9    2     4      3       2
##    5   0       5      3   2   0  10    5     7      5       1
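The "Maximal term length: 51" line suggests a few junk tokens survive the cleaning. DocumentTermMatrix() takes a control list that can constrain the vocabulary at construction time; a sketch using the wordLengths and bounds options, left commented out so the results below still come from dtm_bike (dtm_trim and the cutoffs are illustrative):

# Sketch: cap term length and require a minimum document frequency
# dtm_trim <- DocumentTermMatrix(my_corpus,
#   control = list(wordLengths = c(3, 15),            # keep terms 3-15 characters long
#                  bounds = list(global = c(5, Inf)))) # term must appear in >= 5 documents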
# Sum each column (word) to get its total frequency
words_frequency <- colSums(as.matrix(dtm_bike)) 

# order the freqs
ord <- order(words_frequency, decreasing=TRUE)

# get the top 10 words by frequency of appearance
words_frequency[head(ord, 10)] %>% 
  kable()
term         frequency
the               9134
xxxxxx            6488
xxxxx             6286
xxx               6020
victim            5874
xxxx              5545
officer           5405
stated            5189
xxxxxxx           5133
xxxxxxxx          4252
# Return (unordered) the frequent terms that appear at least 200 times.
findFreqTerms(dtm_bike, lowfreq = 200, highfreq = Inf)
##   [1] "about"         "above"         "accident"      "along"        
##   [5] "also"          "ambulance"     "and"           "approached"   
##   [9] "area"          "arrival"       "arrived"       "asked"        
##  [13] "assigned"      "attempted"     "attention"     "auto"         
##  [17] "ave"           "avenue"        "back"          "bicycle"      
##  [21] "bicyclist"     "bike"          "black"         "blue"         
##  [25] "boston"        "both"          "bumper"        "cab"          
##  [29] "call"          "cambridge"     "came"          "car"          
##  [33] "causing"       "center"        "centre"        "collided"     
##  [37] "coming"        "complained"    "completed"     "contact"      
##  [41] "crash"         "crossing"      "cyclist"       "damage"       
##  [45] "declined"      "described"     "did"           "direction"    
##  [49] "district"      "door"          "dorchester"    "driver"       
##  [53] "drivers"       "driving"       "ems"           "emts"         
##  [57] "evaluation"    "fall"          "fell"          "fire"         
##  [61] "for"           "form"          "from"          "front"        
##  [65] "further"       "going"         "got"           "green"        
##  [69] "ground"        "had"           "hand"          "head"         
##  [73] "her"           "hill"          "him"           "his"          
##  [77] "hit"           "hood"          "hospital"      "hours"        
##  [81] "huntington"    "identified"    "inbound"       "incident"     
##  [85] "information"   "informed"      "injuries"      "injury"       
##  [89] "intersection"  "into"          "involving"     "lane"         
##  [93] "later"         "left"          "leg"           "light"        
##  [97] "made"          "make"          "making"        "male"         
## [101] "mass"          "massachusetts" "medical"       "minor"        
## [105] "mirror"        "motor"         "not"           "notified"     
## [109] "observed"      "officer"       "officers"      "one"          
## [113] "onto"          "opened"        "operating"     "operator"     
## [117] "outbound"      "pain"          "parked"        "parties"      
## [121] "passenger"     "pedestrian"    "police"        "radio"        
## [125] "rear"          "red"           "refused"       "reg"          
## [129] "report"        "reported"      "responded"     "riding"       
## [133] "right"         "road"          "said"          "saw"          
## [137] "scene"         "see"           "sgt"           "she"          
## [141] "shoulder"      "side"          "sidewalk"      "spoke"        
## [145] "stated"        "states"        "stop"          "stopped"      
## [149] "street"        "struck"        "suspect"       "sustained"    
## [153] "taken"         "taxi"          "that"          "the"          
## [157] "then"          "this"          "time"          "tire"         
## [161] "took"          "towards"       "traffic"       "transported"  
## [165] "traveling"     "treated"       "treatment"     "tremont"      
## [169] "truck"         "turn"          "turned"        "unit"         
## [173] "unknown"       "upon"          "vehicle"       "victim"       
## [177] "victims"       "was"           "washington"    "way"          
## [181] "went"          "wheel"         "when"          "while"        
## [185] "white"         "who"           "with"          "witness"      
## [189] "xxx"           "xxxs"          "xxxx"          "xxxxx"        
## [193] "xxxxxx"        "xxxxxxx"       "xxxxxxxx"      "xxxxxxxxx"    
## [197] "xxxxxxxxxx"    "xxxxxxxxxxx"   "xxxxxxxxxxxx"  "xxxxxxxxxxxxx"
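With 3,454 documents the matrix is also very sparse. If it ever became unwieldy, removeSparseTerms() drops the terms absent from most documents; a sketch, where the 0.95 threshold keeps only terms present in at least roughly 5% of the narratives (dtm_dense is an illustrative name):

# Sketch: drop terms missing from more than 95% of the documents
dtm_dense <- removeSparseTerms(dtm_bike, sparse = 0.95)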
# Which words tend to occur together?
# Find words correlated with a specific term at a coefficient of at least 0.70
findAssocs(dtm_bike, "accident", .70) %>% 
  kable()
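The rendered table comes back empty: no term reaches a 0.70 correlation with "accident". Lowering corlimit surfaces weaker associations; a quick sketch (the 0.30 cutoff is illustrative):

# Sketch: relax the correlation threshold to see weaker associations
findAssocs(dtm_bike, "accident", corlimit = 0.30)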
# convert the dtm into a tidy data frame (one row per document-term pair)
df_bike <- tidy(dtm_bike)

# drop the redacted tokens (terms ending in "x", e.g. "xxx", "xxxxxx")
df_bike <- df_bike[!grepl("^.+x$",df_bike$term),]
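Note that ^.+x$ drops every term that merely ends in "x", so a real word such as "box" or "tax" would vanish along with the redactions. A stricter pattern, sketched here but not applied (so the graphs below match the filter above), matches only pure runs of "x" plus the plural "xxxs" seen in the frequent-terms list:

# Sketch: match only the redaction tokens themselves
# df_bike <- df_bike[!grepl("^x+s?$", df_bike$term),]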

# Total each term's count across all documents, then graph the totals.

a <- df_bike %>%
  group_by(term) %>%
  # filter(document == 1 ) %>%
  mutate(count_total = sum(count)) %>%  # total count per term across all documents
  filter(!duplicated(count_total))      # keep one row per term

a <- a[a$term != "the",]  # drop "the": capitalized "The" escaped stop word removal, then became "the" after tolower()

pp <- a %>%
  filter(count_total > 1500) %>%
  ggplot(aes(x = reorder(term, -count_total), y= count_total)) +
  geom_bar(stat = "identity", fill = "red3") +
  labs(title = "The top 20 most used words in bike collision narratives",
             x = "Words", y = "count") +
  theme(text = element_text(size = 8)) + 
  coord_flip()
#   facet_wrap(~document, ncol = 2) +

pp

# plotly::ggplotly(pp)

The bar graph shows the 20 most used words in the bike collision narratives. "Victim" and "officer" are the two most frequent; both appear between 5,000 and 6,000 times across the accident descriptions. Every word in the graph is expected, and together they form the familiar vocabulary of traffic accident reports.

wordcloud(words = a$term, freq = a$count_total, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

Here we see a word cloud of the most frequent words. Terms such as "suspect" and "intersection" also stand out, hinting at the circumstances surrounding the accidents.
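One last note on reproducibility: wordcloud() places and rotates words with some randomness (random.order = FALSE fixes only the ordering, not the layout), so seeding the RNG keeps the figure stable across knits; the seed value below is arbitrary:

# Seed the RNG before the wordcloud() call so the layout is reproducible
set.seed(1234)
wordcloud(words = a$term, freq = a$count_total, min.freq = 1, max.words = 200,
          random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))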