Load the library functions

library(NLP)
library(topicmodels)
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)  
library(rJava)
library(coreNLP)
library(wordnet)

Read the text file

# Work from the project folder and load the OSHA accident records.
# NOTE(review): this absolute path only exists on the author's machine;
# adjust it (or switch to a project-relative path) before running elsewhere.
setwd("C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Text Analytic/WK 1/mini project")
# Tab-separated file without a header row; quote = "" turns quoting off so
# apostrophes and quotes inside the text are read literally.
textdata <- read.delim(
  file = "osha.txt",
  header = FALSE,
  sep = "\t",
  quote = "",
  stringsAsFactors = FALSE
)
# Show the first record (columns V1, V2 and V3).
textdata[1, ]
##          V1                                              V2
## 1 201079928  Employee Is Burned By Forklift Radiator Fluid 
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         V3
## 1  At approximately 11:30 a.m. on November 13  2013  Employee #1  with Edco Waste  & Recycling Services  was operating a forklift (Linde Lift Truck; Serial  Number: H2X393S04578; identified by the employer as FL-3) from approximately  4:00 a.m.  moving bales of recyclable paper products from a collection area in  the yard into trucks. Then  Employee #1 cleaned and was replacing an air  filter on the forklift FL-3. To clean out the air filter  Employee #1 parked  FL-3 in the doorway of the maintenance building. The air filter was located on  the rear of the forklift  behind the cab frame on the driver's side. Employee  #1 removed the air filter and cleaned it out  and then he climbed up onto the  back of the forklift to replace it. While up on the back of the forklift   Employee #1's foot dislodged the cooling system radiator cap. The fluid in the  lift truck's cooling system was hot and under pressure from being operated all  morning. The hot fluid sprayed up and out of the reservoir. Employee #1 was  burned on the upper legs and the groin area. Employee #1 jumped off of the  back of the forklift onto the ground. Coworkers came to his assistance and  called emergency services. Employee #1 was hospitalized at a burn center for  over 24 hours  for treatment of second degree burns to the upper legs and  groin area.

Separate V2 and V3

# V2 carries the short incident titles, V3 the full incident narratives.
comment <- textdata[["V2"]]
comment1 <- textdata[["V3"]]
# Peek at the first few titles.
head(comment)
## [1] " Employee Is Burned By Forklift Radiator Fluid "            
## [2] " Employee Falls From Flatbed Trailer And Later Dies "       
## [3] " Two Workers Are Struck By Motor Vehicle And One Is Killed "
## [4] " Employee Is Struck By Bales Of Wire And Killed "           
## [5] " Employee Is Splashed With Hot Water And Is Burned "        
## [6] " Employee Suffers Burns While Moving Soup "
head(comment1)
## [1] " At approximately 11:30 a.m. on November 13  2013  Employee #1  with Edco Waste  & Recycling Services  was operating a forklift (Linde Lift Truck; Serial  Number: H2X393S04578; identified by the employer as FL-3) from approximately  4:00 a.m.  moving bales of recyclable paper products from a collection area in  the yard into trucks. Then  Employee #1 cleaned and was replacing an air  filter on the forklift FL-3. To clean out the air filter  Employee #1 parked  FL-3 in the doorway of the maintenance building. The air filter was located on  the rear of the forklift  behind the cab frame on the driver's side. Employee  #1 removed the air filter and cleaned it out  and then he climbed up onto the  back of the forklift to replace it. While up on the back of the forklift   Employee #1's foot dislodged the cooling system radiator cap. The fluid in the  lift truck's cooling system was hot and under pressure from being operated all  morning. The hot fluid sprayed up and out of the reservoir. Employee #1 was  burned on the upper legs and the groin area. Employee #1 jumped off of the  back of the forklift onto the ground. Coworkers came to his assistance and  called emergency services. Employee #1 was hospitalized at a burn center for  over 24 hours  for treatment of second degree burns to the upper legs and  groin area.                                                                     "                                                                                                                                                        
## [2] " On August 30  2013  Employee #1 was working from a flatbed trailer. As he  worked  he fell from the flatbed trailer onto the ground  striking his  abdomen. The fall height of the flatbed trailer was 57 inches. Employee #1  sustained unspecified injuries in the fall that later on caused his death.      "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
## [3] " On August 27  2013  Employees #1 and #2  of Templar Inc.  a construction  company specializing in fiber optic installation and services  were working  along a highway. The highway speed limit was posted at 55 miles per hour.  Employee #1 was marking the location of an underground line that ran below the  turn lane. Employee #2 was next to Employee #1 and performing the duties of a  flagger. A privately owned vehicle was travelling in the travel/through lane.  The vehicle veered to the right  entered the turn lane  and struck both  workers. Emergency medical services were called. Employee #1 was declared dead  at the scene. Employee #2 refused emergency medical treatment for the bruises  he received when struck.                                                        "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
## [4] " On August 26  2013  Employee #1  with Lee Iron and Metal Company  Inc.  was  using a forklift (Hyster 50 sit down rider) to move and stack bundled cubes of  recycled insulated co-axial (coax) cable and insulated copper wire. The cubes  weighed anywhere from 1 000 to 2 000 lbs. Employee #1 had completed stacking  one column  consisting of five cubes in height  along the northeast wall of  the recycling warehouse. Employee #1 was beginning a new second column  directly in front of the existing first column. Employee #1 placed the first  cube of the second column tight against the bottom cube of the existing first  column. Employee #1 then backed the forklift up  approximately 5 ft  and  exited the forklift. Employee #1 then stepped in between the forklift and the  cubes to either clean off the top from possible loose wires that created a  hump or to place 4 ft by 4 ft dunnage so the next cube could be stacked. The  five tier stack of cubes began to overturn. The first one went over Employee  #1's head and settled on the forks. The second cube (weighing 1 948 lbs)  struck Employee #1 on the left side of his face and upper torso and forced him  back and down. Employee #1 was pinned between the first cube that fell and  second cube. A third cube also fell  which struck Employee #1's legs. Employee  #1 received a puncture wound to the right abdomen area which caused increased  blood loss. According to the medical examiner's report  Employee #1 died at  the scene from blunt force trauma to the head.                                  "
## [5] " On July 14  2013  Employee #1  vacuum pump truck driver and operator  was  offloading hot brine water at a geothermal power plant. He was assigned to  transfer loads of the brine between power plants and had already made several  trips between facilities. When he arrived at the plant  Employee #1 connected  the hose to the vehicle's tank outlet valve and proceeded to empty the tanks  contents by gravity. While the tank was left emptying  he went inside the  plant's control center briefly to cool off and to get a drink of water.  Employee #1 noticed that the flow of brine had stopped due to a clog in the  hose when he returned. He tried to clear the hose by switching the truck to  vacuum. As he did this  he noticed that the flow was still impeded. Employee  #1 loosened the hose coupler on the truck's valve  which caused hot  pressurized brine to flow out. He attempted to hold the hose  but eventually  let go  causing the hot brine water to splash on his left abdomen  right leg   left leg and left shoulder. Employee #1 rushed into the control room and  removed his hot brine soaked clothing. A coworker observed the pump truck with  the brine pouring out and Employee #1 running toward the control room. This  coworker shut off the valve on the pump truck and went to check on Employee  #1. Employee #1 was taken to a hospital and was then transferred to the burn  unit of a medical center. He was admitted to the medical center  where he was  treated for second and third-degree burns and then hospitalized.                "             
## [6] " On June 30  2013  Employee #1 was working in a food Taqueria for a  supermarket. The employee was in the kitchen area transferring soup from the  kitchen area to a refrigerator. While in the refrigerator  the employee lifted  the pail while hot and the pail was soft due to the heat  it folded causing  the liquid to spill on her the employee's arm  chest and abdomen area.  Employee #1 was transported to an area hospital  where she was treated for  second and third degree burns and remains hospitalized.                         "

Create my own stop words

# Stop-word list: tm's built-in English list plus project-specific terms.
# It contains both full words and stems (e.g. "employee" / "employe",
# "approxim") because the same list is passed to removeWords both before and
# after stemming.
# unique() drops repeated entries ("will" used to be listed twice, and some
# custom words may already be in the English list), which keeps the pattern
# built by removeWords as small as possible without changing what is removed.
my_stopwords <- unique(c(
  stopwords("english"),
  "will", "also", "etc", "else", "can", "even", "within", "without",
  "well", "say", "year", "must", "need", "never", "now", "want", "still",
  "time", "therefore", "send", "today", "may", "many", "make", "whose",
  "however", "get", "have", "just", "him", "worker", "employe", "one", "two",
  "employ", "work", "cowork", "approxim", "use", "day", "employee"
))

Create the corpus

# Wrap each text column as a tm vector source and build a VCorpus from it:
# `corpus` holds the V2 titles, `corpus1` the V3 narratives.
vector <- VectorSource(comment)
corpus <- VCorpus(vector)
vector1 <- VectorSource(comment1)
corpus1 <- VCorpus(vector1)

Loop and explore 1 - 6 sentences

# Print the raw content of the first six title documents.
for (i in seq_len(6)) {
  print(corpus[[i]][1])
}
## $content
## [1] " Employee Is Burned By Forklift Radiator Fluid "
## 
## $content
## [1] " Employee Falls From Flatbed Trailer And Later Dies "
## 
## $content
## [1] " Two Workers Are Struck By Motor Vehicle And One Is Killed "
## 
## $content
## [1] " Employee Is Struck By Bales Of Wire And Killed "
## 
## $content
## [1] " Employee Is Splashed With Hot Water And Is Burned "
## 
## $content
## [1] " Employee Suffers Burns While Moving Soup "
# Print the raw content of the first six narrative documents.
for (i in seq_len(6)) {
  print(corpus1[[i]][1])
}
## $content
## [1] " At approximately 11:30 a.m. on November 13  2013  Employee #1  with Edco Waste  & Recycling Services  was operating a forklift (Linde Lift Truck; Serial  Number: H2X393S04578; identified by the employer as FL-3) from approximately  4:00 a.m.  moving bales of recyclable paper products from a collection area in  the yard into trucks. Then  Employee #1 cleaned and was replacing an air  filter on the forklift FL-3. To clean out the air filter  Employee #1 parked  FL-3 in the doorway of the maintenance building. The air filter was located on  the rear of the forklift  behind the cab frame on the driver's side. Employee  #1 removed the air filter and cleaned it out  and then he climbed up onto the  back of the forklift to replace it. While up on the back of the forklift   Employee #1's foot dislodged the cooling system radiator cap. The fluid in the  lift truck's cooling system was hot and under pressure from being operated all  morning. The hot fluid sprayed up and out of the reservoir. Employee #1 was  burned on the upper legs and the groin area. Employee #1 jumped off of the  back of the forklift onto the ground. Coworkers came to his assistance and  called emergency services. Employee #1 was hospitalized at a burn center for  over 24 hours  for treatment of second degree burns to the upper legs and  groin area.                                                                     "
## 
## $content
## [1] " On August 30  2013  Employee #1 was working from a flatbed trailer. As he  worked  he fell from the flatbed trailer onto the ground  striking his  abdomen. The fall height of the flatbed trailer was 57 inches. Employee #1  sustained unspecified injuries in the fall that later on caused his death.      "
## 
## $content
## [1] " On August 27  2013  Employees #1 and #2  of Templar Inc.  a construction  company specializing in fiber optic installation and services  were working  along a highway. The highway speed limit was posted at 55 miles per hour.  Employee #1 was marking the location of an underground line that ran below the  turn lane. Employee #2 was next to Employee #1 and performing the duties of a  flagger. A privately owned vehicle was travelling in the travel/through lane.  The vehicle veered to the right  entered the turn lane  and struck both  workers. Emergency medical services were called. Employee #1 was declared dead  at the scene. Employee #2 refused emergency medical treatment for the bruises  he received when struck.                                                        "
## 
## $content
## [1] " On August 26  2013  Employee #1  with Lee Iron and Metal Company  Inc.  was  using a forklift (Hyster 50 sit down rider) to move and stack bundled cubes of  recycled insulated co-axial (coax) cable and insulated copper wire. The cubes  weighed anywhere from 1 000 to 2 000 lbs. Employee #1 had completed stacking  one column  consisting of five cubes in height  along the northeast wall of  the recycling warehouse. Employee #1 was beginning a new second column  directly in front of the existing first column. Employee #1 placed the first  cube of the second column tight against the bottom cube of the existing first  column. Employee #1 then backed the forklift up  approximately 5 ft  and  exited the forklift. Employee #1 then stepped in between the forklift and the  cubes to either clean off the top from possible loose wires that created a  hump or to place 4 ft by 4 ft dunnage so the next cube could be stacked. The  five tier stack of cubes began to overturn. The first one went over Employee  #1's head and settled on the forks. The second cube (weighing 1 948 lbs)  struck Employee #1 on the left side of his face and upper torso and forced him  back and down. Employee #1 was pinned between the first cube that fell and  second cube. A third cube also fell  which struck Employee #1's legs. Employee  #1 received a puncture wound to the right abdomen area which caused increased  blood loss. According to the medical examiner's report  Employee #1 died at  the scene from blunt force trauma to the head.                                  "
## 
## $content
## [1] " On July 14  2013  Employee #1  vacuum pump truck driver and operator  was  offloading hot brine water at a geothermal power plant. He was assigned to  transfer loads of the brine between power plants and had already made several  trips between facilities. When he arrived at the plant  Employee #1 connected  the hose to the vehicle's tank outlet valve and proceeded to empty the tanks  contents by gravity. While the tank was left emptying  he went inside the  plant's control center briefly to cool off and to get a drink of water.  Employee #1 noticed that the flow of brine had stopped due to a clog in the  hose when he returned. He tried to clear the hose by switching the truck to  vacuum. As he did this  he noticed that the flow was still impeded. Employee  #1 loosened the hose coupler on the truck's valve  which caused hot  pressurized brine to flow out. He attempted to hold the hose  but eventually  let go  causing the hot brine water to splash on his left abdomen  right leg   left leg and left shoulder. Employee #1 rushed into the control room and  removed his hot brine soaked clothing. A coworker observed the pump truck with  the brine pouring out and Employee #1 running toward the control room. This  coworker shut off the valve on the pump truck and went to check on Employee  #1. Employee #1 was taken to a hospital and was then transferred to the burn  unit of a medical center. He was admitted to the medical center  where he was  treated for second and third-degree burns and then hospitalized.                "
## 
## $content
## [1] " On June 30  2013  Employee #1 was working in a food Taqueria for a  supermarket. The employee was in the kitchen area transferring soup from the  kitchen area to a refrigerator. While in the refrigerator  the employee lifted  the pail while hot and the pail was soft due to the heat  it folded causing  the liquid to spill on her the employee's arm  chest and abdomen area.  Employee #1 was transported to an area hospital  where she was treated for  second and third degree burns and remains hospitalized.                         "

Check the column 2 first

# Clean the V2 (title) corpus. The steps run in exactly this order, and
# my_stopwords is applied twice: once to the lower-cased raw words and once
# more after stemming (the list contains both full words and stems, e.g.
# "employee" and "employe").
corpus <- tm_map(corpus, content_transformer(tolower)) # convert to lower case
corpus <- tm_map(corpus, removeNumbers) # remove digits
corpus <- tm_map(corpus, removeWords, my_stopwords) # stop words, pass 1 (before stemming)
corpus <- tm_map(corpus, removePunctuation) # strip punctuation
corpus <- tm_map(corpus, stemDocument) # word stemming
corpus <- tm_map(corpus, removeWords, my_stopwords) # stop words, pass 2 (after stemming)
corpus <- tm_map(corpus, stripWhitespace) # collapse redundant whitespace: "a  b" -> "a b"

Explore the data from 1 - 6 after data pre-processing

# Print the first six title documents again, now after cleaning.
for (i in seq_len(6)) {
  print(corpus[[i]][1])
}
## $content
## [1] "burn forklift radiat fluid"
## 
## $content
## [1] "fall flatb trailer later die"
## 
## $content
## [1] " struck motor vehicl kill"
## 
## $content
## [1] "struck bale wire kill"
## 
## $content
## [1] "splash hot water burn"
## 
## $content
## [1] "suffer burn move soup"

Convert to document-term matrix

dtm <- DocumentTermMatrix(corpus)

word cloud to check roughly what’s in the data

# Total count of every term across all titles, most frequent first.
tf <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
# Six-colour "Dark2" palette from RColorBrewer.
dark2 <- brewer.pal(n = 6, name = "Dark2")
# Plot at most 50 terms, sized by their total count.
wordcloud(
  words = names(tf),
  freq = tf,
  max.words = 50,
  scale = c(3, 0.8),
  colors = dark2
)

Check the column 3 and perform the data pre-processing

# Clean the V3 (narrative) corpus with the same ordered steps used for V2.
# my_stopwords is applied twice: once to the lower-cased raw words and once
# more after stemming (the list contains both full words and stems, e.g.
# "employee" and "employe").
corpus1 <- tm_map(corpus1, content_transformer(tolower)) # convert to lower case
corpus1 <- tm_map(corpus1, removeNumbers) # remove digits
corpus1 <- tm_map(corpus1, removeWords, my_stopwords) # stop words, pass 1 (before stemming)
corpus1 <- tm_map(corpus1, removePunctuation) # strip punctuation
corpus1 <- tm_map(corpus1, stemDocument) # word stemming
corpus1 <- tm_map(corpus1, removeWords, my_stopwords) # stop words, pass 2 (after stemming)
corpus1 <- tm_map(corpus1, stripWhitespace) # collapse redundant whitespace: "a  b" -> "a b"

Explore the data from 1-6 after pre-processing

# Print the first six narrative documents again, now after cleaning.
for (i in seq_len(6)) {
  print(corpus1[[i]][1])
}
## $content
## [1] " m novemb edco wast recycl servic oper forklift lind lift truck serial number hxs identifi fl m move bale recycl paper product collect area yard truck clean replac air filter forklift fl clean air filter park fl doorway mainten build air filter locat rear forklift behind cab frame driver side remov air filter clean climb onto back forklift replac back forklift s foot dislodg cool system radiat cap fluid lift truck cool system hot pressur oper morn hot fluid spray reservoir burn upper leg groin area jump back forklift onto ground came assist call emerg servic hospit burn center hour treatment second degre burn upper leg groin area"
## 
## $content
## [1] "august flatb trailer fell flatb trailer onto ground strike abdomen fall height flatb trailer inch sustain unspecifi injuri fall later caus death"
## 
## $content
## [1] "august templar inc construct compani special fiber optic instal servic along highway highway speed limit post mile per hour mark locat underground line ran turn lane next perform duti flagger privat vehicl travel travel lane vehicl veer right enter turn lane struck emerg medic servic call declar dead scene refus emerg medic treatment bruis receiv struck"
## 
## $content
## [1] "august lee iron metal compani inc forklift hyster sit rider move stack bundl cube recycl insul coaxial coax cabl insul copper wire cube weigh anywher lbs complet stack column consist five cube height along northeast wall recycl warehous begin new second column direct front exist first column place first cube second column tight bottom cube exist first column back forklift ft exit forklift step forklift cube either clean top possibl loos wire creat hump place ft ft dunnag next cube stack five tier stack cube began overturn first went s head settl fork second cube weigh lbs struck left side face upper torso forc back pin first cube fell second cube third cube fell struck s leg receiv punctur wound right abdomen area caus increas blood loss accord medic examin report die scene blunt forc trauma head"
## 
## $content
## [1] "juli vacuum pump truck driver oper offload hot brine water geotherm power plant assign transfer load brine power plant alreadi made sever trip facil arriv plant connect hose vehicl tank outlet valv proceed empti tank content graviti tank left empti went insid plant control center briefli cool drink water notic flow brine stop due clog hose return tri clear hose switch truck vacuum notic flow imped loosen hose coupler truck valv caus hot pressur brine flow attempt hold hose eventu let go caus hot brine water splash left abdomen right leg left leg left shoulder rush control room remov hot brine soak cloth observ pump truck brine pour run toward control room shut valv pump truck went check taken hospit transfer burn unit medic center admit medic center treat second thirddegre burn hospit"
## 
## $content
## [1] "june food taqueria supermarket kitchen area transfer soup kitchen area refriger refriger lift pail hot pail soft due heat fold caus liquid spill s arm chest abdomen area transport area hospit treat second third degre burn remain hospit"

Convert to document-term matrix

dtm1 <- DocumentTermMatrix(corpus1)

word cloud to check roughly what’s in the data

# Total count of every term across all narratives, most frequent first.
tf1 <- sort(colSums(as.matrix(dtm1)), decreasing = TRUE)
# Six-colour "Dark2" palette from RColorBrewer.
dark2 <- brewer.pal(n = 6, name = "Dark2")
# Plot at most 50 terms, sized by their total count.
wordcloud(
  words = names(tf1),
  freq = tf1,
  max.words = 50,
  scale = c(2.3, 0.8),
  colors = dark2
)

Topic modeling on V2

Find the total count of words in each document

# Number of term occurrences remaining in each title document.
rowTotals <- rowSums(as.matrix(dtm))

Remove all documents that have no words left after pre-processing (LDA cannot be fitted on empty documents)

# Keep only the documents that still contain at least one term.
dtm <- dtm[rowTotals > 0, ]

LDA is a Bayesian mixture model.

Two estimation methods are available for LDA: VEM and Gibbs.

let’s use Gibbs method

We can produce wordcloud for more intuitive visualisation of the topics.

Let’s create a helper function which takes in a topic model and the index

of a topic and generates a wordcloud for this topic.

# Fit a 5-topic LDA model on the title DTM with Gibbs sampling.
# The sampler is stochastic: without a fixed seed the topic numbering and the
# document assignments change on every run, so results cannot be compared
# with the written interpretation. Fixing the seed makes the fit reproducible.
lda_5_g <- LDA(dtm, k = 5, method = "Gibbs", control = list(seed = 1234L))

What is in the topics?

Look at the most frequent terms for each topic

terms(lda_5_g, 10)
##       Topic 1      Topic 2  Topic 3  Topic 4   Topic 5  
##  [1,] "fall"       "amput"  "kill"   "injur"   "burn"   
##  [2,] "fractur"    "finger" "struck" "die"     "suffer" 
##  [3,] "electr"     "caught" "crush"  "fall"    "explos" 
##  [4,] "shock"      "machin" "truck"  "later"   "sustain"
##  [5,] "leg"        "injur"  "crane"  "collaps" "fire"   
##  [6,] "ladder"     "hand"   "pin"    "head"    "injur"  
##  [7,] "electrocut" "arm"    "run"    "elev"    "chemic" 
##  [8,] "roof"       "press"  "vehicl" "drown"   "expos"  
##  [9,] "line"       "oper"   "roll"   "exposur" "injuri" 
## [10,] "forklift"   "lacer"  "lift"   "attack"  "hot"

Function logLik() gives us the log-likelihood of the model,

which is the sum over the log-likelihoods of all documents,

maximized during maximum likelihood estimation of the model

logLik(lda_5_g)
## 'log Lik.' -201360.9 (df=13615)

use “@terms” to find out the terms in columns

beta, logarithmized parameters of the word distribution for topic N

lda_5_g@terms[1:10]
##  [1] "abdomen"  "abdomin"  "abrad"    "abras"    "access"   "accid"   
##  [7] "accident" "accord"   "accumul"  "acet"
lda_5_g@beta[3, 1:10]
##  [1]  -7.271407 -11.382281 -11.382281 -11.382281  -8.984386 -11.382281
##  [7] -11.382281 -11.382281 -11.382281 -11.382281

get the matrix of log-probabilities of words over topics - the beta

name the columns of the matrix with the corresponding terms

get the ith topic (a vector of word log-probabilities) and sort them in decreasing order

display the top 20 most probable words in wordcloud

# Draw a word cloud of the most probable terms of one topic.
#
# Arguments:
#   m      fitted topic model exposing @beta (topics x terms matrix of
#          logarithmized word probabilities) and @terms (one label per column).
#   i      index of the topic (row of m@beta) to plot.
#   n      number of top terms to draw (default 20, the previous fixed value).
#   colors palette handed to wordcloud(); defaults to the six-colour "Dark2"
#          palette, so the function no longer depends on a global `dark2`.
# Returns whatever wordcloud() returns; called for its plot.
showcloud <- function(m, i, n = 20, colors = brewer.pal(6, "Dark2")) {
  log_beta <- m@beta
  colnames(log_beta) <- m@terms
  # Most probable terms first. head() copes with vocabularies smaller than n,
  # where indexing with 1:n would pad the vector with NA entries.
  top <- head(sort(log_beta[i, ], decreasing = TRUE), n)
  # beta stores natural logarithms, so exp() (not 2^) recovers the word
  # probabilities and keeps the relative word sizes faithful.
  wordcloud(
    names(top), exp(top),
    scale = c(2.3, 0.8), rot.per = 0.3, colors = colors
  )
}

showcloud(lda_5_g, 5) #show cloud for the selected topic

Now how do we know which document belongs to which topic?

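Let us rank all 5 topics for each of the first ten documents.

In the output below, the first document most likely belongs to topic 1, the second document to topic 3, and so on (the exact assignments depend on the random seed of the Gibbs sampler).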

t(topics(lda_5_g, 5))[1:10,]
##    [,1] [,2] [,3] [,4] [,5]
## 1     1    2    4    5    3
## 2     3    4    1    2    5
## 3     3    1    2    4    5
## 4     1    3    2    4    5
## 5     5    1    2    3    4
## 6     5    1    2    3    4
## 7     1    3    4    2    5
## 8     3    1    2    4    5
## 9     1    3    2    4    5
## 10    1    2    5    3    4

which topic has largest number of documents?

which.max(tabulate(topics(lda_5_g)))
## [1] 1
tabulate(topics(lda_5_g))
## [1] 2726 2136 1911 1557 1663
table(topics(lda_5_g))
## 
##    1    2    3    4    5 
## 2726 2136 1911 1557 1663

gamma, posterior topic distribution for each document, gives the actual probabilities

let’s look at the probabilities of the first document belonging to each of the topics

In the output below, the first document has no clear winner: topics 1, 2, 4 and 5 tie at about 0.204, and topic 3 is only slightly lower.

lda_5_g@gamma[1,]
## [1] 0.2037037 0.2037037 0.1851852 0.2037037 0.2037037
barplot(lda_5_g@gamma[1,], names.arg=1:5, main="Topic distribution of Story 1")

~~~

Topic modeling on V3

LDA is a Bayesian mixture model.

Two estimation methods are available for LDA: VEM and Gibbs.

let’s use Gibbs method

# Fit a 5-topic LDA model on the narrative DTM with Gibbs sampling.
# The sampler is stochastic: without a fixed seed the topic numbering and the
# document assignments change on every run, so results cannot be compared
# with the written interpretation. Fixing the seed makes the fit reproducible.
lda_5_g1 <- LDA(dtm1, k = 5, method = "Gibbs", control = list(seed = 1234L))

What is in the topics?

Look at the most frequent terms for each topic

terms(lda_5_g1, 10)
##       Topic 1  Topic 2     Topic 3    Topic 4   Topic 5 
##  [1,] "hand"   "hospit"    "truck"    "fell"    "burn"  
##  [2,] "left"   "medic"     "back"     "ground"  "electr"
##  [3,] "machin" "transport" "side"     "feet"    "power" 
##  [4,] "right"  "servic"    "oper"     "lift"    "tank"  
##  [5,] "oper"   "emerg"     "load"     "fractur" "water" 
##  [6,] "hospit" "center"    "struck"   "floor"   "line"  
##  [7,] "finger" "treat"     "move"     "fall"    "fire"  
##  [8,] "cut"    "call"      "forklift" "top"     "air"   
##  [9,] "number" "accid"     "kill"     "ladder"  "system"
## [10,] "amput"  "report"    "trailer"  "concret" "hospit"

Function logLik() gives us the log-likelihood of the model,

which is the sum over the log-likelihoods of all documents,

maximized during maximum likelihood estimation of the model

logLik(lda_5_g1)
## 'log Lik.' -3308266 (df=74835)

use “@terms” to find out the terms in columns

beta, logarithmized parameters of the word distribution for topic N

lda_5_g1@terms[1:10]
##  [1] "aaa"     "aachen"  "aal"     "aard"    "aaron"   "abaco"   "abacus" 
##  [8] "abandon" "abat"    "abc"
lda_5_g1@beta[3, 1:10]
##  [1] -13.83675 -13.83675 -13.83675 -13.83675 -11.43886 -13.83675 -11.43886
##  [8] -11.43886 -13.83675 -13.83675

We can produce wordcloud for more intuitive visualisation of the topics.

Let’s create a helper function which takes in a topic model and the index

of a topic and generates a wordcloud for this topic.

get the matrix of probabilities of words over topics - the beta

name the columns of the matrix with the corresponding terms

get the ith topic (a vector of word probabilities) and sort them in decreasing order

display the 20 most probable words of the topic in a wordcloud

# Draw a word cloud of the highest-probability terms of one topic.
#
# m        fitted topic model exposing @beta (log word probabilities, one row
#          per topic) and @terms (the vocabulary, one entry per beta column)
# i        index of the topic (row of beta) to plot
# n_words  number of top terms to draw (default 20)
# colors   palette handed to wordcloud(); defaults to the script-level `dark2`
showcloud <- function(m, i, n_words = 20, colors = dark2) {
  log_probs <- m@beta
  colnames(log_probs) <- m@terms
  # head() rather than [1:20]: a vocabulary smaller than n_words must not pad
  # the selection with NA entries.
  top <- head(sort(log_probs[i, ], decreasing = TRUE), n_words)
  # beta holds natural logarithms, so exp() (not 2^) recovers the word
  # probabilities that drive the relative word sizes.
  wordcloud(names(top), exp(top), scale = c(2.3, .8), rot.per = 0.3,
            colors = colors)
}

showcloud(lda_5_g1, 5) #show cloud for the selected topic

Now how do we know which document belongs to which topic?

Let us get the 3 most likely topics for the first ten documents.

t(topics(lda_5_g1, 3))[1:10,]
##    [,1] [,2] [,3]
## 1     5    3    2
## 2     4    3    2
## 3     3    2    5
## 4     4    2    3
## 5     5    3    2
## 6     5    1    3
## 7     4    1    3
## 8     3    4    2
## 9     3    2    1
## 10    4    2    1

which topic has largest number of documents?

which.max(tabulate(topics(lda_5_g1)))
## [1] 4
tabulate(topics(lda_5_g1))
## [1] 1908 1858 1874 2226 2134
table(topics(lda_5_g1))
## 
##    1    2    3    4    5 
## 1908 1858 1874 2226 2134

gamma, posterior topic distribution for each document, gives the actual probabilities

let’s look at the probabilities of the first document belonging to each of the topics

lda_5_g1@gamma[1,]
## [1] 0.1381579 0.1447368 0.2894737 0.1250000 0.3026316
barplot(lda_5_g1@gamma[1,], names.arg=1:5, main="Topic distribution of Story 1")

Find the occupations and body parts that are injured most often

Identify the ‘Occupation’ words

Define a helper that returns the hyponyms (the more specific “child” words) of a term

# Return the direct hyponyms (more specific terms) of a noun from WordNet.
#
# x  the noun to look up (exact match)
#
# Returns a character vector holding the words of every synset related to the
# noun's first synset through the "~" pointer, or NULL when the noun is not
# found or no direct hyponyms are available.
hyponyms <- function(x) {
  filter <- getTermFilter("ExactMatchFilter", x, TRUE)
  terms <- getIndexTerms("NOUN", 1, filter)
  # Stop here instead of indexing into an empty lookup result.
  if (length(terms) == 0) {
    message("No WordNet noun entry found for '", x, "'")
    return(NULL)
  }
  synsets <- getSynsets(terms[[1]])
  if (length(synsets) == 0) {
    return(NULL)
  }
  related <- tryCatch(
    getRelatedSynsets(synsets[[1]], "~"),
    error = function(condition) {
      # Only this specific message is treated as "no hyponyms"; any other
      # error is re-raised unchanged.
      if (conditionMessage(condition) != "RcallMethod: invalid object parameter") {
        stop(condition)
      }
      message("No direct hyponyms found")
      NULL
    }
  )
  if (is.null(related)) {
    return(NULL)
  }
  # lapply + unlist flattens the per-synset word vectors without relying on
  # sapply()'s input-dependent simplification.
  unlist(lapply(related, getWord))
}

setDict("C:/Program Files (x86)/WordNet/2.1/dict")
hyponyms("employee")
##  [1] "bartender"             "barman"               
##  [3] "barkeep"               "barkeeper"            
##  [5] "mixologist"            "clerk"                
##  [7] "company man"           "copyist"              
##  [9] "scribe"                "scrivener"            
## [11] "copywriter"            "crewman"              
## [13] "deliveryman"           "delivery boy"         
## [15] "deliverer"             "dining-room attendant"
## [17] "restaurant attendant"  "dispatcher"           
## [19] "dog catcher"           "floater"              
## [21] "floorwalker"           "shopwalker"           
## [23] "gardener"              "gasman"               
## [25] "gofer"                 "hire"                 
## [27] "hired help"            "hireling"             
## [29] "pensionary"            "jobholder"            
## [31] "line worker"           "liveryman"            
## [33] "office boy"            "organization man"     
## [35] "porter"                "Pullman porter"       
## [37] "potboy"                "potman"               
## [39] "public servant"        "registrar"            
## [41] "salesperson"           "sales representative" 
## [43] "sales rep"             "sandwichman"          
## [45] "shop assistant"        "spotter"              
## [47] "spotter"               "staff member"         
## [49] "staffer"               "stagehand"            
## [51] "stage technician"      "stocktaker"           
## [53] "stock-taker"           "sweeper"              
## [55] "tollkeeper"            "tollman"              
## [57] "tollgatherer"          "toll collector"       
## [59] "toll taker"            "toll agent"           
## [61] "toller"                "trainman"             
## [63] "railroader"            "railroad man"         
## [65] "railwayman"            "railway man"          
## [67] "turncock"              "typist"               
## [69] "workman"               "workingman"           
## [71] "working man"           "working person"

Consolidate occupation and body words

# Pool the direct hyponyms of each seed noun into one dictionary per concept.
occ_words <- unlist(lapply(c("employee", "worker"), hyponyms))
body_words <- unlist(lapply(c("external body part", "body part"), hyponyms))

Create function to extract the noun

# Annotate a text with CoreNLP and pull out the lemmas of its noun tokens.
#
# x  text passed to annotateString()
#
# The value is the unlisted "lemma" entries of every token whose POS tag
# starts with "N"; it is returned invisibly.
getNouns <- function(x) {
  tokens <- getToken(annotateString(x))
  noun_rows <- startsWith(tokens[, "POS"], "N")
  invisible(unlist(tokens[noun_rows, "lemma"]))
}

Use the CoreNLP

initCoreNLP(type="english_fast")

Read the file

text <- read.delim("osha.txt", header=FALSE, sep="\t", quote = "", stringsAsFactors = FALSE)
text[1,]
##          V1                                              V2
## 1 201079928  Employee Is Burned By Forklift Radiator Fluid 
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         V3
## 1  At approximately 11:30 a.m. on November 13  2013  Employee #1  with Edco Waste  & Recycling Services  was operating a forklift (Linde Lift Truck; Serial  Number: H2X393S04578; identified by the employer as FL-3) from approximately  4:00 a.m.  moving bales of recyclable paper products from a collection area in  the yard into trucks. Then  Employee #1 cleaned and was replacing an air  filter on the forklift FL-3. To clean out the air filter  Employee #1 parked  FL-3 in the doorway of the maintenance building. The air filter was located on  the rear of the forklift  behind the cab frame on the driver's side. Employee  #1 removed the air filter and cleaned it out  and then he climbed up onto the  back of the forklift to replace it. While up on the back of the forklift   Employee #1's foot dislodged the cooling system radiator cap. The fluid in the  lift truck's cooling system was hot and under pressure from being operated all  morning. The hot fluid sprayed up and out of the reservoir. Employee #1 was  burned on the upper legs and the groin area. Employee #1 jumped off of the  back of the forklift onto the ground. Coworkers came to his assistance and  called emergency services. Employee #1 was hospitalized at a burn center for  over 24 hours  for treatment of second degree burns to the upper legs and  groin area.
commentN <- text$V3

Extract the nouns

# lapply() always yields exactly one list element per document; sapply() may
# simplify the result depending on the input and also names every element
# with the full narrative text.
lemmaN <- lapply(commentN, getNouns)

Convert to corpus

vectorN <- VectorSource(lemmaN)
corpusN <- VCorpus(vectorN)

Convert to DTM by using dictionary of body parts and occupations

# Control settings shared by both dictionary-restricted matrices: only the
# given dictionary terms are kept, my_stopwords are removed, no stemming is
# applied and weightBin is used as the weighting.
make_dict_control <- function(dict) {
  list(dictionary = dict,
       stopwords = my_stopwords,
       stemming = FALSE,
       weighting = weightBin)
}

# Matrix restricted to the occupation dictionary
dtmN <- DocumentTermMatrix(corpusN, control = make_dict_control(occ_words))

# Matrix restricted to the body-part dictionary
dtmB <- DocumentTermMatrix(corpusN, control = make_dict_control(body_words))

Count how many documents mention each dictionary term (column sums of the binary DTMs)

freqN <- colSums(as.matrix(dtmN))
freqB <- colSums(as.matrix(dtmB))

Sort the terms with highest frequency to lowest

sort(freqN, decreasing = TRUE)
##                driver                  help                helper 
##                   447                   170                    87 
##             assistant                washer               spotter 
##                    72                    43                    28 
##             volunteer                 clerk               trimmer 
##                    27                    26                    26 
##                caster                gutter                hanger 
##                    24                    23                    21 
##              stripper               skidder              finisher 
##                    19                    18                    15 
##               sweeper             processor              scrubber 
##                    15                    14                    13 
##               planter                puller                   rat 
##                     9                     9                     9 
##                  tier                 oiler              splitter 
##                     9                     8                     8 
##              stringer              gardener                winder 
##                     8                     7                     7 
##            dispatcher               splicer                 wiper 
##                     6                     6                     6 
##           independent                  hire              bleacher 
##                     5                     4                     3 
##                carter                poster                pruner 
##                     3                     3                     3 
##                porter             temporary                tugger 
##                     2                     2                     2 
##               workman             bartender               crewman 
##                     2                     1                     1 
##               dragger                melter                nailer 
##                     1                     1                     1 
##                peeler                potman              retainer 
##                     1                     1                     1 
##           salesperson              seasonal                 slave 
##                     1                     1                     1 
##                tapper                  temp               thrower 
##                     1                     1                     1 
##                topper               barkeep             barkeeper 
##                     1                     0                     0 
##                barman                beater           bill poster 
##                     0                     0                     0 
##          bill sticker              blackleg                boater 
##                     0                     0                     0 
##               boatman       charcoal burner           company man 
##                     0                     0                     0 
##               copyist            copywriter            cornhusker 
##                     0                     0                     0 
##             deliverer          delivery boy           deliveryman 
##                     0                     0                     0 
## dining-room attendant           dog catcher            employable 
##                     0                     0                     0 
##              employee               floater           floorwalker 
##                     0                     0                     0 
##             freelance                gasman                 gofer 
##                     0                     0                     0 
##           hard worker            hired help              hireling 
##                     0                     0                     0 
##             jobholder           lamplighter           line worker 
##                     0                     0                     0 
##             liveryman            mixologist           moonlighter 
##                     0                     0                     0 
##                mopper               muzzler            office boy 
##                     0                     0                     0 
##      organization man            part-timer            pensionary 
##                     0                     0                     0 
##                 plier                 plyer                potboy 
##                     0                     0                     0 
##        public servant        Pullman porter              quarrier 
##                     0                     0                     0 
##             quarryman             ragsorter          railroad man 
##                     0                     0                     0 
##            railroader           railway man            railwayman 
##                     0                     0                     0 
##             registrar  restaurant attendant             sales rep 
##                     0                     0                     0 
##  sales representative           sandwichman                  scab 
##                     0                     0                     0 
##                scribe             scrivener       seasonal worker 
##                     0                     0                     0 
##  self-employed person               servant               sheller 
##                     0                     0                     0 
##               shelver              shingler        shop assistant 
##                     0                     0                     0 
##            shopwalker              shoveler             shoveller 
##                     0                     0                     0 
##        skilled worker       skilled workman              solderer 
##                     0                     0                     0 
##              sprigger          staff member               staffer 
##                     0                     0                     0 
##      stage technician             stagehand               stainer 
##                     0                     0                     0 
##               stemmer           stock-taker            stocktaker 
##                     0                     0                     0 
##         strikebreaker               striver             supporter 
##                     0                     0                     0 
##                tacker                teaser      temporary worker 
##                     0                     0                     0 
##             throwster               tier up                 tiler 
##                     0                     0                     0 
##                toiler            toll agent        toll collector 
##                     0                     0                     0 
##            toll taker                toller          tollgatherer 
##                     0                     0                     0 
##            tollkeeper               tollman        trade unionist 
##                     0                     0                     0 
##        trained worker              trainman              turncock 
##                     0                     0                     0 
##                twiner                typist          union member 
##                     0                     0                     0 
##              unionist         unpaid worker                wallah 
##                     0                     0                     0 
##              waterman          working girl           working man 
##                     0                     0                     0 
##        working person            workingman              workmate 
##                     0                     0                     0
sort(freqB, decreasing = TRUE)
##                 area                 head               system 
##                 1567                 1131                  683 
##                 body                 face                 back 
##                  621                  608                  493 
##               bottom              process                chest 
##                  351                  348                  344 
##                 neck             shoulder            structure 
##                  303                  282                  215 
##                 rear               member                  hip 
##                  177                  156                  137 
##                 dock                 seat                joint 
##                  134                  128                   90 
##                torso                  toe              abdomen 
##                   87                   82                   74 
##              stomach                 tail            apparatus 
##                   42                   40                   33 
##               tissue            extremity                  lip 
##                   33                   32                   28 
##              buttock                stump                trunk 
##                   27                   24                   24 
##                cheek                 butt               saddle 
##                   23                   18                   17 
##             backside                organ                small 
##                   15                   15                   13 
##              feature               groove                 shin 
##                   12                   11                   10 
##               region                belly               rectum 
##                    9                    6                    6 
##                 lobe                shank               breast 
##                    5                    5                    3 
##                flank               cannon            horseback 
##                    3                    2                    2 
##            posterior                stern               thorax 
##                    2                    2                    2 
##            appendage                 loin               adnexa 
##                    1                    1                    0 
##           ambulacrum              ampulla anatomical structure 
##                    0                    0                    0 
##               annexa                 arse           arthromere 
##                    0                    0                    0 
##          articulatio         articulation                  ass 
##                    0                    0                    0 
##               behind     bodily structure       body structure 
##                    0                    0                    0 
##                  bum                 buns             buttocks 
##                    0                    0                    0 
##                  can                caput               cervix 
##                    0                    0                    0 
##    complex body part                croup               croupe 
##                    0                    0                    0 
##             derriere              dilator               dorsum 
##                    0                    0                    0 
##              energid   external body part                fanny 
##                    0                    0                    0 
##               fornix                frill            fundament 
##                    0                    0                    0 
##               gaskin               haunch             hind end 
##                    0                    0                    0 
##         hindquarters           human face              keister 
##                    0                    0                    0 
##            lineament                loins               lumbus 
##                    0                    0                    0 
##               mentum                nates            outgrowth 
##                    0                    0                    0 
##               pectus                 prat           protoplast 
##                    0                    0                    0 
##             rear end             rudiment                 ruff 
##                    0                    0                    0 
##                 rump               septum             tail end 
##                    0                    0                    0 
##              tooshie                 tush            underpart 
##                    0                    0                    0 
##            vallecula               venter              withers 
##                    0                    0                    0

Write the document-term matrices to CSV files (which can be opened in Excel)

# Output folder for the exported matrices
setwd("C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Text Analytic/WK 1/mini project")

# Occupation matrix: densify, then export
a <- as.matrix(dtmN)
write.csv(a, file = "Occupation.csv")

# Body-part matrix: densify, then export
b <- as.matrix(dtmB)
write.csv(b, file = "BodyParts.csv")

create wordcloud

# Six-colour "Dark2" ColorBrewer palette shared by both clouds
dark2 <- brewer.pal(n = 6, name = "Dark2")

# Cloud of the occupation term counts (at most 60 words)
wordcloud(words = names(freqN), freq = freqN, max.words = 60,
          rot.per = 0.5, colors = dark2)

# Cloud of the body-part term counts (at most 60 words)
wordcloud(words = names(freqB), freq = freqB, max.words = 60,
          rot.per = 0.5, colors = dark2)