#Package 나중에 자동 인스톨 및 로드할 코드 작성
Packages = c(
  'tidyverse',
  'tm',
  'readxl',
  'psych',
  'dplyr',
  'stm',
  'data.table',
  'ggplot2',
  'extrafont',
  'dplyr',
  'magrittr',
  'tidyverse',
  'data.table',
  'tidytext',
  'quanteda',
  'topicmodels',
  'doParallel',
  'scales'
  
)

for(p in Packages){
  if(!require(p,character.only = TRUE)) install.packages(p)
  library(p,character.only = TRUE)
}
## Loading required package: tidyverse
## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## Warning: package 'dplyr' was built under R version 3.5.1
## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## Loading required package: tm
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## Loading required package: readxl
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## Loading required package: stm
## stm v1.3.3 (2018-1-26) successfully loaded. See ?stm for help. 
##  Papers, resources, and other materials at structuraltopicmodel.com
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## Loading required package: extrafont
## Registering fonts with R
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## Loading required package: tidytext
## Loading required package: quanteda
## Package version: 1.3.4
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
## 
##     View
## Loading required package: topicmodels
## Loading required package: doParallel
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# System
set.seed(12345)
Sys.info()
##                                                                                           sysname 
##                                                                                          "Darwin" 
##                                                                                           release 
##                                                                                          "17.7.0" 
##                                                                                           version 
## "Darwin Kernel Version 17.7.0: Thu Jun 21 22:53:14 PDT 2018; root:xnu-4570.71.2~1/RELEASE_X86_64" 
##                                                                                          nodename 
##                                                                    "Chungils-MacBook-Pro-2.local" 
##                                                                                           machine 
##                                                                                          "x86_64" 
##                                                                                             login 
##                                                                                        "chadchae" 
##                                                                                              user 
##                                                                                        "chadchae" 
##                                                                                    effective_user 
##                                                                                        "chadchae"
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] scales_1.0.0      doParallel_1.0.11 iterators_1.0.10 
##  [4] foreach_1.4.4     topicmodels_0.2-7 quanteda_1.3.4   
##  [7] tidytext_0.1.9    magrittr_1.5      extrafont_0.17   
## [10] data.table_1.11.4 stm_1.3.3         psych_1.8.4      
## [13] readxl_1.1.0      tm_0.7-5          NLP_0.1-11       
## [16] forcats_0.3.0     stringr_1.3.1     dplyr_0.7.6      
## [19] purrr_0.2.5       readr_1.1.1       tidyr_0.8.1      
## [22] tibble_1.4.2      ggplot2_3.0.0     tidyverse_1.2.1  
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.3.1         jsonlite_1.5       modelr_0.1.2      
##  [4] RcppParallel_4.4.1 assertthat_0.2.0   stats4_3.5.0      
##  [7] cellranger_1.1.0   yaml_2.2.0         slam_0.1-43       
## [10] Rttf2pt1_1.3.7     pillar_1.3.0       backports_1.1.2   
## [13] lattice_0.20-35    glue_1.3.0         extrafontdb_1.0   
## [16] digest_0.6.16      rvest_0.3.2        colorspace_1.3-2  
## [19] htmltools_0.3.6    Matrix_1.2-14      plyr_1.8.4        
## [22] pkgconfig_2.0.2    broom_0.5.0        haven_1.1.2       
## [25] withr_2.1.2        lazyeval_0.2.1     cli_1.0.0         
## [28] mnormt_1.5-5       crayon_1.3.4       evaluate_0.11     
## [31] stopwords_0.9.0    tokenizers_0.2.1   janeaustenr_0.1.5 
## [34] nlme_3.1-137       SnowballC_0.5.1    xml2_1.2.0        
## [37] foreign_0.8-71     tools_3.5.0        hms_0.4.2         
## [40] munsell_0.5.0      bindrcpp_0.2.2     compiler_3.5.0    
## [43] rlang_0.2.2        grid_3.5.0         rstudioapi_0.7    
## [46] rmarkdown_1.10     gtable_0.2.0       codetools_0.2-15  
## [49] R6_2.2.2           lubridate_1.7.4    knitr_1.20        
## [52] bindr_0.1.1        fastmatch_1.1-0    rprojroot_1.3-2   
## [55] modeltools_0.2-22  stringi_1.2.4      Rcpp_0.12.18      
## [58] spacyr_0.9.91      tidyselect_0.2.4
Packages
##  [1] "tidyverse"   "tm"          "readxl"      "psych"       "dplyr"      
##  [6] "stm"         "data.table"  "ggplot2"     "extrafont"   "dplyr"      
## [11] "magrittr"    "tidyverse"   "data.table"  "tidytext"    "quanteda"   
## [16] "topicmodels" "doParallel"  "scales"
Sys.time()
## [1] "2018-09-15 17:28:41 EDT"
# 한글설정
#Sys.setlocale("LC_CTYPE", "ko_KR.UTF-8")
#fonts()
#par(family='AppleMyungjo')
#theme_set(theme_gray(base_family='AppleMyungjo'))
# Read Data
rawdata<-read.csv("../Data/accident.csv")
rawdata$NARRATIVE<-as.character(rawdata$NARRATIVE)
table(rawdata$DEGREE_INJURY_CD)
## 
##     ?     0     1    10     2     3     4     5     6     7     8     9 
##   910 25159   902  1722  1920 69931 14822 32159 56086  8039   982   415
# 3 locations(subunit), 4 degree of injury, year

# Record Data
rawdata$DEGREE_INJURY_CD<-as.character(rawdata$DEGREE_INJURY_CD)
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="?"] <- NA
rawdata$DEGREE_INJURY_CD<-as.numeric(rawdata$DEGREE_INJURY_CD)

table(rawdata$DEGREE_INJURY_CD)
## 
##     0     1     2     3     4     5     6     7     8     9    10 
## 25159   902  1920 69931 14822 32159 56086  8039   982   415  1722
summary(rawdata$DEGREE_INJURY_CD)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   3.000   4.000   4.036   6.000  10.000     910
# Filter :restricts the dataset to coal mines only, produces an error
rawdata<-filter(rawdata, COAL_METAL_IND == "C")
table(rawdata$COAL_METAL_IND)
## 
##      C      M 
## 104854      0
table(rawdata$DEGREE_INJURY_CD)
## 
##     0     1     2     3     4     5     6     7     8     9    10 
## 21788   460   832 44379  4142  4927 22646  4494   273    60   459
table(rawdata$SUBUNIT_CD)
## 
##     1     2     3     4     5     6    17    30    99 
## 75818  4849 15916   345   167    37   343  7170   209
# SUBUNIT_CD= 1,2 - 1; 3; >3-3; 99-NA 
#1-underground mining, 2-surface mining, 3-others
rawdata$SUBUNIT_CD<-as.character(rawdata$SUBUNIT_CD)
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="1"] <- 1
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="2"] <- 1
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="3"] <- 2
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="4"] <- 3
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="5"] <- 3
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="6"] <- 3
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="12"] <- 3
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="17"] <- 3
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="30"] <- 3
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="99"] <- NA
rawdata$SUBUNIT_CD<-as.numeric(rawdata$SUBUNIT_CD)

table(rawdata$SUBUNIT_CD)
## 
##     1     2     3 
## 80667 15916  8062
table(rawdata$SUBUNIT_CD, rawdata$DEGREE_INJURY_CD)
##    
##         0     1     2     3     4     5     6     7     8     9    10
##   1 21101   318   610 32760  2822  3347 15735  3268   135    18   254
##   2   500    97   140  7533   869  1190  4452   802    98    29   143
##   3   182    44    82  4006   443   382  2385   408    32    11    57
table(rawdata$NO_INJURIES)
## 
##     0     1     2     3     4     5     6     7     8     9    10    13 
## 21789 82265   487    93    56    15    36     7     8     9    10    13 
##    14    16    36 
##    14    16    36
rawdata<-filter(rawdata, NO_INJURIES > 0)
table(rawdata$NO_INJURIES)
## 
##     1     2     3     4     5     6     7     8     9    10    13    14 
## 82265   487    93    56    15    36     7     8     9    10    13    14 
##    16    36 
##    16    36
table(rawdata$SUBUNIT_CD, rawdata$DEGREE_INJURY_CD)
##    
##         1     2     3     4     5     6     7     8     9    10
##   1   318   610 32760  2822  3347 15735  3268   135    18   254
##   2    97   140  7533   869  1190  4452   802    98    29   143
##   3    44    82  4006   443   382  2385   408    32    11    56
table(rawdata$DEGREE_INJURY_CD)
## 
##     1     2     3     4     5     6     7     8     9    10 
##   460   832 44379  4142  4927 22646  4494   273    60   458
# Regrouping
rawdata$DEGREE_INJURY_CD<-as.character(rawdata$DEGREE_INJURY_CD)
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="1"] <- 1
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="2"] <- 2
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="3"] <- 3
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="4"] <- 4
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="5"] <- 5
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="6"] <- 6
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="7"] <- NA
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="8"] <- NA
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="9"] <- NA
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="10"] <- NA
rawdata$DEGREE_INJURY_CD<-as.numeric(rawdata$DEGREE_INJURY_CD)

table(rawdata$DEGREE_INJURY_CD)
## 
##     1     2     3     4     5     6 
##   460   832 44379  4142  4927 22646
# 3,4,5 days lost from work, 6 by itself
rawdata$DEGREE_INJURY_CD<-as.character(rawdata$DEGREE_INJURY_CD)
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="1"] <- 1
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="2"] <- 2
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="3"] <- 3
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="4"] <- 3
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="5"] <- 3
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="6"] <- 4
rawdata$DEGREE_INJURY_CD<-as.numeric(rawdata$DEGREE_INJURY_CD)

table(rawdata$DEGREE_INJURY_CD)
## 
##     1     2     3     4 
##   460   832 53448 22646
nrow(rawdata)
## [1] 83065
table(rawdata$DEGREE_INJURY_CD, rawdata$SUBUNIT_CD)
##    
##         1     2     3
##   1   318    97    44
##   2   610   140    82
##   3 38929  9592  4831
##   4 15735  4452  2385
table(rawdata$DEGREE_INJURY_CD, rawdata$CAL_YR)
##    
##     2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
##   1   38   42   28   30   28   23   47   34   30   18   48   20   20   20
##   2   91   51   42   56   50   65   52   52   50   64   40   62   41   36
##   3 4703 4527 4478 3493 3512 3537 3538 3364 3326 3068 2851 2981 2661 2411
##   4 1597 1679 1492 1286 1468 1557 1618 1441 1529 1372 1400 1559 1321 1124
##    
##     2014 2015 2016
##   1   16   11    7
##   2   33   33   14
##   3 2314 1804  880
##   4 1016  810  377
# short narrative, spell and poor English, what kind of machine, 

#save(rawdata, file="../Data/data1.rds")
# Load Basic Setting and Dataset
source("../Script/0_Cleaning.R")
load("../Data/data1.rds")

# Data Coding
## Data Seleciton
data<-select(rawdata, CAL_YR, SUBUNIT_CD, DEGREE_INJURY_CD, NARRATIVE)
head(data)
##   CAL_YR SUBUNIT_CD DEGREE_INJURY_CD
## 1   2002          3                4
## 2   2003          3                4
## 3   2003          2                3
## 4   2004          2                3
## 5   2002          1                4
## 6   2002          1                4
##                                                                                                                                                                                                                                                                                                                          NARRATIVE
## 1                                               INJURED WAS USING A UTILITY KNIFE TO CUT RUBBER GASKET MATERIAL ON A FLAT TABLE IN THE SHOP. HE HAD THE KNIFE IN HIS RIGHT HAND CUTTING TOWARDS HIMSELF. THE KNIFE SLIPPED & HIT HIS LEFT INDEX FINGER, CUTTING IT. HE WAS NOT WEARING GLOVES. CUT REQUIRED TWO STITCHES TO CLOSE.
## 2 INJURED WAS IN THE ELEVATOR ON THE 1ST FLOOR OF THE PLANT. HE WAS ATTEMPTING TO CLOSE THE OUTER DOOR. HE WAS PULLING DOWN ON THE STRAP WITH BOTH HANDS. HE LEANED TOWARD THE DOOR AS HE PULLED & WHEN THE TWO PARTS OF THE DOOR CAME TOGETHER HE GOT HIS MIDDLE, RING & LITTLE FINGER ON HIS RIGHT HAND BETWEEN THE DOOR HALVES.
## 3                                                                                                                                                                                                                                                  DISMOUNTING RR250, CAME DOWN LADDER AND RT KNEE STRUCK PLATFORM ON THE MACHINE.
## 4                                                                                                                                                               EE WAS AT THE SOUTH END OF PIT TO HELP MOVE A WATER PUMP. HE WAS WALKING NEAR THE REAR OF PUMP WHEN HE SLIPPED & FELL ON HIS RT. ARM, CONDITIONS WERE WET & MUDDY.
## 5                                                                                                                                                                INJURED WAS PULLING A 1" AIRLINE BESIDE L.M. HEADER WHEN HE STRUCK HIS LEFT SHOULDER AGAINST A FIRE SUPPRESSION OUTLET RESULTING IN A LACERATION TO HIS SHOULDER.
## 6                                                                                                                                                                                            INJURED WAS GREASING CRUSHER BEARING WHEN PICK CAME OFF CRUSHER, KNOCKING COVER OFF AND STRIKING INDIVIDUAL IN LEFT HAND AND FOREARM.
## Factors
### SUBUNIT_CO
data$SUBUNIT_CD_F[data$SUBUNIT_CD==1]<-"Underground"
data$SUBUNIT_CD_F[data$SUBUNIT_CD==2]<-"Surface"
data$SUBUNIT_CD_F[data$SUBUNIT_CD==3]<-"Other Locations"
data$SUBUNIT_CD_F<-as.factor(data$SUBUNIT_CD_F)
head(data$SUBUNIT_CD_F)
## [1] Other Locations Other Locations Surface         Surface        
## [5] Underground     Underground    
## Levels: Other Locations Surface Underground
### DEGREE_INJURY_CD : 1 = Death, 2=Disability 3=Lost Time 4=No Lost Time

data$DEGREE_INJURY_CD_F[data$DEGREE_INJURY_CD==1]<-"Death"
data$DEGREE_INJURY_CD_F[data$DEGREE_INJURY_CD==2]<-"Disability"
data$DEGREE_INJURY_CD_F[data$DEGREE_INJURY_CD==3]<-"LostTime"
data$DEGREE_INJURY_CD_F[data$DEGREE_INJURY_CD==4]<-"NoLostTime"
data$DEGREE_INJURY_CD_F<-as.factor(data$DEGREE_INJURY_CD_F)
head(data$DEGREE_INJURY_CD_F)
## [1] NoLostTime NoLostTime LostTime   LostTime   NoLostTime NoLostTime
## Levels: Death Disability LostTime NoLostTime
# Text Analysis

## Corpus
###Cleaning
data$NARRATIVE <- gsub("[^[:alnum:]///' ]", "", data$NARRATIVE)
TEXTBASIC<- Corpus(VectorSource(data$NARRATIVE))
CORPUS <- tm_map(TEXTBASIC, removePunctuation)   
## Warning in tm_map.SimpleCorpus(TEXTBASIC, removePunctuation):
## transformation drops documents
as.character(inspect(CORPUS[100]))
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] Employee was standing on a crossflow hose hitting an 8 pipe and slipped and fell hurting right lower leg EMPLOYEE DID NOT START LOSING TIME UNTIL 6308 DUE TO INFECTION
## [1] "Employee was standing on a crossflow hose hitting an 8 pipe and slipped and fell hurting right lower leg EMPLOYEE DID NOT START LOSING TIME UNTIL 6308 DUE TO INFECTION"
## [2] "list(language = \"en\")"                                                                                                                                                
## [3] "list()"
CORPUS <- tm_map(CORPUS, removeNumbers)   
## Warning in tm_map.SimpleCorpus(CORPUS, removeNumbers): transformation drops
## documents
as.character(inspect(CORPUS[100]))
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] Employee was standing on a crossflow hose hitting an  pipe and slipped and fell hurting right lower leg EMPLOYEE DID NOT START LOSING TIME UNTIL  DUE TO INFECTION
## [1] "Employee was standing on a crossflow hose hitting an  pipe and slipped and fell hurting right lower leg EMPLOYEE DID NOT START LOSING TIME UNTIL  DUE TO INFECTION"
## [2] "list(language = \"en\")"                                                                                                                                           
## [3] "list()"
CORPUS <- tm_map(CORPUS, tolower)  
## Warning in tm_map.SimpleCorpus(CORPUS, tolower): transformation drops
## documents
as.character(inspect(CORPUS[100]))
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] employee was standing on a crossflow hose hitting an  pipe and slipped and fell hurting right lower leg employee did not start losing time until  due to infection
## [1] "employee was standing on a crossflow hose hitting an  pipe and slipped and fell hurting right lower leg employee did not start losing time until  due to infection"
## [2] "list(language = \"en\")"                                                                                                                                           
## [3] "list()"
CORPUS <- tm_map(CORPUS, removeWords, stopwords("english"))  
## Warning in tm_map.SimpleCorpus(CORPUS, removeWords, stopwords("english")):
## transformation drops documents
CORPUS <- tm_map(CORPUS, removeWords, stopwords("SMART"))  
## Warning: 'stopwords(language = "SMART")' is deprecated.
## Use 'stopwords(source = "smart")' instead.
## See help("Deprecated")
## Warning in tm_map.SimpleCorpus(CORPUS, removeWords, stopwords("SMART")):
## transformation drops documents
as.character(inspect(CORPUS[100]))
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] employee  standing   crossflow hose hitting   pipe  slipped  fell hurting  lower leg employee   start losing time   due  infection
## [1] "employee  standing   crossflow hose hitting   pipe  slipped  fell hurting  lower leg employee   start losing time   due  infection"
## [2] "list(language = \"en\")"                                                                                                           
## [3] "list()"
CORPUS <- tm_map(CORPUS, removeWords, c("and", "for", "that","not", "did", 
                                        "due", "to",  "this", "are", "is", 
                                        "am","with","their","the","can",
                                        "from","article", "study", "ee", 
                                        "employee", "left", "right", "up","down"))
## Warning in tm_map.SimpleCorpus(CORPUS, removeWords, c("and", "for",
## "that", : transformation drops documents
#CORPUS <- tm_map(CORPUS, stemDocument)
#as.character(inspect(CORPUS[100]))
CORPUS <- tm_map(CORPUS, stripWhitespace)  
## Warning in tm_map.SimpleCorpus(CORPUS, stripWhitespace): transformation
## drops documents
as.character(inspect(CORPUS[100]))
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1]  standing crossflow hose hitting pipe slipped fell hurting lower leg start losing time infection
## [1] " standing crossflow hose hitting pipe slipped fell hurting lower leg start losing time infection"
## [2] "list(language = \"en\")"                                                                         
## [3] "list()"
#CORPUS <- tm_map(CORPUS, PlainTextDocument) 
#as.character(inspect(CORPUS[100]))
#inspect(CORPUS)

tdm <- TermDocumentMatrix(CORPUS)
dtm <- DocumentTermMatrix(CORPUS)
dim(tdm)
## [1] 26281 83065
dim(dtm)
## [1] 83065 26281
inspect(dtm)
## <<DocumentTermMatrix (documents: 83065, terms: 26281)>>
## Non-/sparse entries: 1141901/2181889364
## Sparsity           : 100%
## Maximal term length: 30
## Weighting          : term frequency (tf)
## Sample             :
##        Terms
## Docs    back causing fell finger hand piece rock roof slipped work
##   17219    0       1    1      0    0     0    1    0       0    0
##   21659    1       0    0      0    0     0    0    0       0    1
##   26361    0       2    0      0    0     0    2    0       0    0
##   35028    1       0    1      0    0     0    0    0       0    0
##   36027    0       0    0      2    0     0    0    0       0    0
##   36558    1       0    0      0    0     0    0    0       0    0
##   52021    1       0    0      0    0     0    0    0       0    0
##   55303    0       0    1      0    0     0    0    0       0    0
##   59415    0       1    0      1    0     0    0    0       0    0
##   8261     0       0    0      0    0     0    0    0       0    0
inspect(tdm)
## <<TermDocumentMatrix (terms: 26281, documents: 83065)>>
## Non-/sparse entries: 1141901/2181889364
## Sparsity           : 100%
## Maximal term length: 30
## Weighting          : term frequency (tf)
## Sample             :
##          Docs
## Terms     17219 21659 26361 35028 36027 36558 52021 55303 59415 8261
##   back        0     1     0     1     0     1     1     0     0    0
##   causing     1     0     2     0     0     0     0     0     1    0
##   fell        1     0     0     1     0     0     0     1     0    0
##   finger      0     0     0     0     2     0     0     0     1    0
##   hand        0     0     0     0     0     0     0     0     0    0
##   piece       0     0     0     0     0     0     0     0     0    0
##   rock        1     0     2     0     0     0     0     0     0    0
##   roof        0     0     0     0     0     0     0     0     0    0
##   slipped     0     0     0     0     0     0     0     0     0    0
##   work        0     1     0     0     0     0     0     0     0    0
df <- data.frame(text = get("content", CORPUS))
df$text<-as.character(df$text)
data$NARRATIVE_DF<-df$text

## Word Frequency
dfm<-dfm(df$text)
features_dfm <- textstat_frequency(dfm(df$text), n = 20)
features_dfm
##    feature frequency rank docfreq group
## 1     back     19747    1   17273   all
## 2     fell     16879    2   16269   all
## 3     rock     14755    3   11937   all
## 4     hand     13971    4   11216   all
## 5     roof     13144    5   10022   all
## 6  causing     12325    6   11796   all
## 7     work     11976    7   10077   all
## 8   finger     11178    8    8929   all
## 9  slipped     10977    9   10889   all
## 10   piece     10684   10    9977   all
## 11    pain     10648   11    9932   all
## 12  struck     10504   12   10117   all
## 13    belt     10388   13    7557   all
## 14    knee      8918   14    7328   all
## 15   miner      8881   15    7088   all
## 16 injured      8812   16    7163   all
## 17    felt      8393   17    8259   all
## 18     hit      7606   18    7105   all
## 19   cable      7481   19    5485   all
## 20    head      7250   20    6322   all
# Sort by reverse frequency order
features_dfm$feature <- with(features_dfm, reorder(feature, -frequency))

p <- ggplot(features_dfm, aes(feature, frequency))    
p <- p + geom_bar(stat="identity")   
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))   
p  

## TF-IDF
tfidfdfm<-dfm_tfidf(dfm)
head(tfidfdfm[, 5:10])
## Document-feature matrix of: 6 documents, 6 features (83.3% sparse).
## 6 x 6 sparse Matrix of class "dfm"
##        features
## docs      rubber   gasket material     flat   table     shop
##   text1 2.270083 3.743327  1.86634 2.345387 2.94629 2.249172
##   text2 0        0         0       0        0       0       
##   text3 0        0         0       0        0       0       
##   text4 0        0         0       0        0       0       
##   text5 0        0         0       0        0       0       
##   text6 0        0         0       0        0       0
features_tfidfdfm <- textstat_frequency(tfidfdfm, n = 20)
features_tfidfdfm
##    feature frequency rank docfreq group
## 1     back 13468.447    1   17273   all
## 2     rock 12431.425    2   11937   all
## 3     hand 12148.903    3   11216   all
## 4     roof 12072.286    4   10022   all
## 5     fell 11951.298    5   16269   all
## 6     work 10971.056    6   10077   all
## 7   finger 10827.181    7    8929   all
## 8     belt 10814.621    8    7557   all
## 9  causing 10447.697    9   11796   all
## 10   piece  9833.747   10    9977   all
## 11    pain  9821.517   11    9932   all
## 12 slipped  9686.435   12   10889   all
## 13  struck  9604.504   13   10117   all
## 14   miner  9492.851   14    7088   all
## 15    knee  9403.430   15    7328   all
## 16 injured  9378.815   16    7163   all
## 17   cable  8829.386   17    5485   all
## 18    bolt  8606.047   18    5153   all
## 19    felt  8413.904   19    8259   all
## 20   truck  8270.390   20    4703   all
## Kwic by top 30 words
quacor<-quanteda::corpus(data$NARRATIVE_DF)
head(kwic(quacor, "work*", window = 3, valuetype = "glob"), 30)
##                                                       
##    [text7, 28]           lt shoulder lost |   work   |
##    [text8, 13]          rotator cuff tear |   work   |
##    [text9, 16]    shoulder employees lost |   work   |
##    [text10, 2]                     stated | working  |
##   [text11, 13]         tears surgery lost |   work   |
##   [text13, 12]         torn meniscus lost |   work   |
##    [text14, 9]        hand employees lost |   work   |
##   [text46, 23] protect longer accommodate |   work   |
##   [text48, 15]      mashed finger surgery |   work   |
##   [text52, 25]        ease opinion injury |   work   |
##   [text52, 28]        work related agreed |   work   |
##    [text56, 1]                            | working  |
##    [text91, 8]         putting joint pipe | working  |
##  [text115, 18]     finger started missing |   work   |
##   [text116, 3]            started missing |   work   |
##  [text118, 12]       back started missing |   work   |
##   [text120, 9]     shoulder began missing |   work   |
##   [text154, 9]           ees shift failed | workable |
##   [text159, 5] office personnel condition |  worked  |
##  [text163, 17] physical therapy treatment |   work   |
##  [text163, 20]     work restrictions lost | workdays |
##  [text166, 14]   shoulder released return |   work   |
##   [text172, 1]                            |   work   |
##   [text172, 5]           injury turned sa | workers  |
##   [text176, 9]             eye washed eye |  worked  |
##   [text184, 8]        hands feels related |   work   |
##  [text189, 16]      knee impact continued |   work   |
##   [text193, 9]       hit forearm returned |   work   |
##  [text213, 13]     shoulder pop continued | working  |
##  [text214, 20]        lower back returned |   work   |
##                            
##  day occur untul           
##  day occur surgery         
##  day surgery               
##  feeder ram car            
##  day                       
##  day occur                 
##  day occurred surgery      
##  restrictions starting     
##                            
##  related agreed work       
##  related started losing    
##  inert atmosphere wearing  
##  december surgery          
##  surgery                   
##  diagnosed alleged carpal  
##                            
##                            
##  power ladder installed    
##  doctor underworkers comp  
##  restrictions lost workdays
##  occurred result injury    
##  making lost time          
##  injury turned sa          
##  comp bulged discs         
##  shift doctor              
##  belt maintenance years    
##  knee surgery              
##  reevaluated missed damage 
##  surgery performed rt      
##  restricted duty
head(kwic(quacor, "dead*", window = 3, valuetype = "glob"), 30)
##                                                                   
##   [text1624, 23]             tram accidentally hit |    dead     |
##   [text2539, 17]               desired height time |    dead     |
##   [text2663, 14]    transported surface pronounced |    dead     |
##   [text2771, 24] regained consciousness pronounced |    dead     |
##   [text2785, 16]    revive unsuccessful pronounced |    dead     |
##    [text4093, 7]               mine mechanic found |    dead     |
##    [text5324, 4]              security guard found |    dead     |
##   [text5604, 16]         grabbing handrail located |  deadlink   |
##   [text5604, 24]                    lower leg foot |  deadlink   |
##   [text6591, 14]                   feb pmman found |    dead     |
##   [text6591, 21]                   paper man found |    dead     |
##    [text7187, 2]                           pulling |    dead     |
##   [text7917, 19]       ill unsuccessful pronounced |    dead     |
##    [text8224, 2]                            states | deadheading |
##    [text8504, 6]          slumped dozer pronounced |    dead     |
##   [text9751, 24]      memorial hospital pronounced |    dead     |
##  [text12574, 16]     emergency hospital pronounced |    dead     |
##  [text13131, 15]                   tub coming back |  deadhead   |
##  [text13484, 13]                   top tree pulled |    dead     |
##  [text13594, 14]  proceeded employeewas pronounced |    dead     |
##  [text13741, 26]   transported hospital pronounced |    dead     |
##  [text14690, 15]   transported hospital pronounced |    dead     |
##   [text15759, 4]          injured plugging charger |    dead     |
##  [text15905, 21]     community hospital pronounced |    dead     |
##  [text15958, 10]                   cable tray foot |   deadman   |
##  [text16318, 30]           thick victim pronounced |    dead     |
##  [text18027, 15]   ambulance technician pronounced |    dead     |
##   [text20384, 5]                twine pocket knife |    dead     |
##   [text21935, 8]      area subsequently pronounced |    dead     |
##  [text23512, 14]     ambulance hospital pronounced |    dead     |
##                                
##  man tram moved                
##  end cable bracket             
##  helicopter ambulance personnel
##  apparent heart attack         
##                                
##  cab fuel truck                
##  inside guard shack            
##  causing fall backward         
##  pusharm causing fall          
##  mountain work thursday        
##  weiser state forest           
##  mantrip chain broke           
##  hospital work related         
##  endloaderforklift drill site  
##  emergency room doctor         
##  pm                            
##  apparent heart attack         
##  machine foottoes pinched      
##  tree approx diameter          
##  hospital monday nov           
##  pm natural                    
##  approximately pm              
##  set ram batteries             
##                                
##  switch                        
##  scene assistant knox          
##  arrival hospital              
##  end shoe dropped              
##  result natural                
##  pm death listed
head(kwic(quacor, "injury*", window = 3, valuetype = "glob"), 30)
##                                                          
##   [text52, 24]                dis ease opinion | injury |
##  [text163, 23]        workdays occurred result | injury |
##  [text164, 13]           equipment deck ground | injury |
##  [text171, 17]          finger msha reportable | injury |
##   [text172, 2]                            work | injury |
##  [text172, 15]       attention incident relate | injury |
##  [text215, 11] confirming workrelatedness foot | injury |
##  [text274, 14]             shift work pinpoint | injury |
##  [text288, 11]                  sled lost time | injury |
##  [text298, 11]          face medical treatment | injury |
##  [text299, 13]      stitches medical treatment | injury |
##  [text301, 13]            rail restricted duty | injury |
##  [text302, 12]            reportable lost time | injury |
##  [text303, 11]          foot medical treatment | injury |
##  [text307, 13]                   pop lost time | injury |
##  [text308, 26]               abdomen lost time | injury |
##  [text309, 10]            reportable lost time | injury |
##   [text319, 8]               knee swollen date | injury |
##  [text327, 24]     reoccurrance prior shoulder | injury |
##  [text331, 18]         forehead belt structure | injury |
##  [text354, 19]          member truss resulting | injury |
##   [text376, 8]       forearm medical treatment | injury |
##  [text377, 16]                   leg lost time | injury |
##   [text378, 7]         wrist medical treatment | injury |
##  [text379, 12]            reportable lost time | injury |
##  [text395, 20]         recently trimmed corner | injury |
##  [text404, 14]           worked week reporting | injury |
##  [text418, 27]          point causing crushing | injury |
##  [text436, 14]        rear timber reaggravated | injury |
##   [text439, 7] aggravated preexisting personal | injury |
##                          
##  work related agreed     
##                          
##  resulted ear laceration 
##                          
##  turned sa workers       
##  degree                  
##                          
##                          
##  november                
##                          
##                          
##                          
##  surgery                 
##                          
##                          
##                          
##                          
##  reported day missed     
##  stated dr               
##  required sutures        
##  rib cage                
##                          
##  surgery                 
##                          
##  surgery                 
##  required stitches repair
##                          
##                          
##  day operating shuttle   
##  dr days put
## quanteda corpus
dd<-corpus(data$NARRATIVE_DF)
docvars(dd, "Year") <- data$CAL_YR
docvars(dd, "Location") <- data$SUBUNIT_CD_F
docvars(dd, "Degree") <- data$DEGREE_INJURY_CD_F
head(docvars(dd))
##       Year        Location     Degree
## text1 2002 Other Locations NoLostTime
## text2 2003 Other Locations NoLostTime
## text3 2003         Surface   LostTime
## text4 2004         Surface   LostTime
## text5 2002     Underground NoLostTime
## text6 2002     Underground NoLostTime
## Lexical dispersion plot : Plotting a kwic object produces a lexical dispersion plot which allows us to visualize the occurrences of particular terms throughout the text. We call these “x-ray” plots due to their similarity.
#ldp <- corpus_subset(dd, Location == "Underground")
#kwic(ldp, pattern = "work") %>%
#  textplot_xray()

#textplot_xray(
#  kwic(ldp, "work"),
#  kwic(ldp, "dead"),
#  kwic(ldp, "injury")
#)