#Package 나중에 자동 인스톨 및 로드할 코드 작성
Packages = c(
'tidyverse',
'tm',
'readxl',
'psych',
'dplyr',
'stm',
'data.table',
'ggplot2',
'extrafont',
'dplyr',
'magrittr',
'tidyverse',
'data.table',
'tidytext',
'quanteda',
'topicmodels',
'doParallel',
'scales'
)
for(p in Packages){
if(!require(p,character.only = TRUE)) install.packages(p)
library(p,character.only = TRUE)
}
## Loading required package: tidyverse
## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## Warning: package 'dplyr' was built under R version 3.5.1
## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Loading required package: tm
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
## Loading required package: readxl
## Loading required package: psych
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Loading required package: stm
## stm v1.3.3 (2018-1-26) successfully loaded. See ?stm for help.
## Papers, resources, and other materials at structuraltopicmodel.com
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## Loading required package: extrafont
## Registering fonts with R
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## Loading required package: tidytext
## Loading required package: quanteda
## Package version: 1.3.4
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
## Loading required package: topicmodels
## Loading required package: doParallel
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
## Loading required package: scales
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# System
set.seed(12345)
Sys.info()
## sysname
## "Darwin"
## release
## "17.7.0"
## version
## "Darwin Kernel Version 17.7.0: Thu Jun 21 22:53:14 PDT 2018; root:xnu-4570.71.2~1/RELEASE_X86_64"
## nodename
## "Chungils-MacBook-Pro-2.local"
## machine
## "x86_64"
## login
## "chadchae"
## user
## "chadchae"
## effective_user
## "chadchae"
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] scales_1.0.0 doParallel_1.0.11 iterators_1.0.10
## [4] foreach_1.4.4 topicmodels_0.2-7 quanteda_1.3.4
## [7] tidytext_0.1.9 magrittr_1.5 extrafont_0.17
## [10] data.table_1.11.4 stm_1.3.3 psych_1.8.4
## [13] readxl_1.1.0 tm_0.7-5 NLP_0.1-11
## [16] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.6
## [19] purrr_0.2.5 readr_1.1.1 tidyr_0.8.1
## [22] tibble_1.4.2 ggplot2_3.0.0 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] httr_1.3.1 jsonlite_1.5 modelr_0.1.2
## [4] RcppParallel_4.4.1 assertthat_0.2.0 stats4_3.5.0
## [7] cellranger_1.1.0 yaml_2.2.0 slam_0.1-43
## [10] Rttf2pt1_1.3.7 pillar_1.3.0 backports_1.1.2
## [13] lattice_0.20-35 glue_1.3.0 extrafontdb_1.0
## [16] digest_0.6.16 rvest_0.3.2 colorspace_1.3-2
## [19] htmltools_0.3.6 Matrix_1.2-14 plyr_1.8.4
## [22] pkgconfig_2.0.2 broom_0.5.0 haven_1.1.2
## [25] withr_2.1.2 lazyeval_0.2.1 cli_1.0.0
## [28] mnormt_1.5-5 crayon_1.3.4 evaluate_0.11
## [31] stopwords_0.9.0 tokenizers_0.2.1 janeaustenr_0.1.5
## [34] nlme_3.1-137 SnowballC_0.5.1 xml2_1.2.0
## [37] foreign_0.8-71 tools_3.5.0 hms_0.4.2
## [40] munsell_0.5.0 bindrcpp_0.2.2 compiler_3.5.0
## [43] rlang_0.2.2 grid_3.5.0 rstudioapi_0.7
## [46] rmarkdown_1.10 gtable_0.2.0 codetools_0.2-15
## [49] R6_2.2.2 lubridate_1.7.4 knitr_1.20
## [52] bindr_0.1.1 fastmatch_1.1-0 rprojroot_1.3-2
## [55] modeltools_0.2-22 stringi_1.2.4 Rcpp_0.12.18
## [58] spacyr_0.9.91 tidyselect_0.2.4
Packages
## [1] "tidyverse" "tm" "readxl" "psych" "dplyr"
## [6] "stm" "data.table" "ggplot2" "extrafont" "dplyr"
## [11] "magrittr" "tidyverse" "data.table" "tidytext" "quanteda"
## [16] "topicmodels" "doParallel" "scales"
Sys.time()
## [1] "2018-09-15 17:28:41 EDT"
# 한글설정
#Sys.setlocale("LC_CTYPE", "ko_KR.UTF-8")
#fonts()
#par(family='AppleMyungjo')
#theme_set(theme_gray(base_family='AppleMyungjo'))
# Read Data
rawdata<-read.csv("../Data/accident.csv")
rawdata$NARRATIVE<-as.character(rawdata$NARRATIVE)
table(rawdata$DEGREE_INJURY_CD)
##
## ? 0 1 10 2 3 4 5 6 7 8 9
## 910 25159 902 1722 1920 69931 14822 32159 56086 8039 982 415
# 3 locations(subunit), 4 degree of injury, year
# Record Data
rawdata$DEGREE_INJURY_CD<-as.character(rawdata$DEGREE_INJURY_CD)
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="?"] <- NA
rawdata$DEGREE_INJURY_CD<-as.numeric(rawdata$DEGREE_INJURY_CD)
table(rawdata$DEGREE_INJURY_CD)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 25159 902 1920 69931 14822 32159 56086 8039 982 415 1722
summary(rawdata$DEGREE_INJURY_CD)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 3.000 4.000 4.036 6.000 10.000 910
# Filter :restricts the dataset to coal mines only, produces an error
rawdata<-filter(rawdata, COAL_METAL_IND == "C")
table(rawdata$COAL_METAL_IND)
##
## C M
## 104854 0
table(rawdata$DEGREE_INJURY_CD)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 21788 460 832 44379 4142 4927 22646 4494 273 60 459
table(rawdata$SUBUNIT_CD)
##
## 1 2 3 4 5 6 17 30 99
## 75818 4849 15916 345 167 37 343 7170 209
# SUBUNIT_CD= 1,2 - 1; 3; >3-3; 99-NA
#1-underground mining, 2-surface mining, 3-others
rawdata$SUBUNIT_CD<-as.character(rawdata$SUBUNIT_CD)
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="1"] <- 1
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="2"] <- 1
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="3"] <- 2
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="4"] <- 3
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="5"] <- 3
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="6"] <- 3
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="12"] <- 3
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="17"] <- 3
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="30"] <- 3
rawdata$SUBUNIT_CD[rawdata$SUBUNIT_CD=="99"] <- NA
rawdata$SUBUNIT_CD<-as.numeric(rawdata$SUBUNIT_CD)
table(rawdata$SUBUNIT_CD)
##
## 1 2 3
## 80667 15916 8062
table(rawdata$SUBUNIT_CD, rawdata$DEGREE_INJURY_CD)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1 21101 318 610 32760 2822 3347 15735 3268 135 18 254
## 2 500 97 140 7533 869 1190 4452 802 98 29 143
## 3 182 44 82 4006 443 382 2385 408 32 11 57
table(rawdata$NO_INJURIES)
##
## 0 1 2 3 4 5 6 7 8 9 10 13
## 21789 82265 487 93 56 15 36 7 8 9 10 13
## 14 16 36
## 14 16 36
rawdata<-filter(rawdata, NO_INJURIES > 0)
table(rawdata$NO_INJURIES)
##
## 1 2 3 4 5 6 7 8 9 10 13 14
## 82265 487 93 56 15 36 7 8 9 10 13 14
## 16 36
## 16 36
table(rawdata$SUBUNIT_CD, rawdata$DEGREE_INJURY_CD)
##
## 1 2 3 4 5 6 7 8 9 10
## 1 318 610 32760 2822 3347 15735 3268 135 18 254
## 2 97 140 7533 869 1190 4452 802 98 29 143
## 3 44 82 4006 443 382 2385 408 32 11 56
table(rawdata$DEGREE_INJURY_CD)
##
## 1 2 3 4 5 6 7 8 9 10
## 460 832 44379 4142 4927 22646 4494 273 60 458
# Regrouping
rawdata$DEGREE_INJURY_CD<-as.character(rawdata$DEGREE_INJURY_CD)
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="1"] <- 1
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="2"] <- 2
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="3"] <- 3
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="4"] <- 4
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="5"] <- 5
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="6"] <- 6
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="7"] <- NA
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="8"] <- NA
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="9"] <- NA
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="10"] <- NA
rawdata$DEGREE_INJURY_CD<-as.numeric(rawdata$DEGREE_INJURY_CD)
table(rawdata$DEGREE_INJURY_CD)
##
## 1 2 3 4 5 6
## 460 832 44379 4142 4927 22646
# 3,4,5 days lost from work, 6 by itself
rawdata$DEGREE_INJURY_CD<-as.character(rawdata$DEGREE_INJURY_CD)
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="1"] <- 1
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="2"] <- 2
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="3"] <- 3
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="4"] <- 3
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="5"] <- 3
rawdata$DEGREE_INJURY_CD[rawdata$DEGREE_INJURY_CD=="6"] <- 4
rawdata$DEGREE_INJURY_CD<-as.numeric(rawdata$DEGREE_INJURY_CD)
table(rawdata$DEGREE_INJURY_CD)
##
## 1 2 3 4
## 460 832 53448 22646
nrow(rawdata)
## [1] 83065
table(rawdata$DEGREE_INJURY_CD, rawdata$SUBUNIT_CD)
##
## 1 2 3
## 1 318 97 44
## 2 610 140 82
## 3 38929 9592 4831
## 4 15735 4452 2385
table(rawdata$DEGREE_INJURY_CD, rawdata$CAL_YR)
##
## 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
## 1 38 42 28 30 28 23 47 34 30 18 48 20 20 20
## 2 91 51 42 56 50 65 52 52 50 64 40 62 41 36
## 3 4703 4527 4478 3493 3512 3537 3538 3364 3326 3068 2851 2981 2661 2411
## 4 1597 1679 1492 1286 1468 1557 1618 1441 1529 1372 1400 1559 1321 1124
##
## 2014 2015 2016
## 1 16 11 7
## 2 33 33 14
## 3 2314 1804 880
## 4 1016 810 377
# short narrative, spell and poor English, what kind of machine,
#save(rawdata, file="../Data/data1.rds")
# Load Basic Setting and Dataset
source("../Script/0_Cleaning.R")
load("../Data/data1.rds")
# Data Coding
## Data Seleciton
data<-select(rawdata, CAL_YR, SUBUNIT_CD, DEGREE_INJURY_CD, NARRATIVE)
head(data)
## CAL_YR SUBUNIT_CD DEGREE_INJURY_CD
## 1 2002 3 4
## 2 2003 3 4
## 3 2003 2 3
## 4 2004 2 3
## 5 2002 1 4
## 6 2002 1 4
## NARRATIVE
## 1 INJURED WAS USING A UTILITY KNIFE TO CUT RUBBER GASKET MATERIAL ON A FLAT TABLE IN THE SHOP. HE HAD THE KNIFE IN HIS RIGHT HAND CUTTING TOWARDS HIMSELF. THE KNIFE SLIPPED & HIT HIS LEFT INDEX FINGER, CUTTING IT. HE WAS NOT WEARING GLOVES. CUT REQUIRED TWO STITCHES TO CLOSE.
## 2 INJURED WAS IN THE ELEVATOR ON THE 1ST FLOOR OF THE PLANT. HE WAS ATTEMPTING TO CLOSE THE OUTER DOOR. HE WAS PULLING DOWN ON THE STRAP WITH BOTH HANDS. HE LEANED TOWARD THE DOOR AS HE PULLED & WHEN THE TWO PARTS OF THE DOOR CAME TOGETHER HE GOT HIS MIDDLE, RING & LITTLE FINGER ON HIS RIGHT HAND BETWEEN THE DOOR HALVES.
## 3 DISMOUNTING RR250, CAME DOWN LADDER AND RT KNEE STRUCK PLATFORM ON THE MACHINE.
## 4 EE WAS AT THE SOUTH END OF PIT TO HELP MOVE A WATER PUMP. HE WAS WALKING NEAR THE REAR OF PUMP WHEN HE SLIPPED & FELL ON HIS RT. ARM, CONDITIONS WERE WET & MUDDY.
## 5 INJURED WAS PULLING A 1" AIRLINE BESIDE L.M. HEADER WHEN HE STRUCK HIS LEFT SHOULDER AGAINST A FIRE SUPPRESSION OUTLET RESULTING IN A LACERATION TO HIS SHOULDER.
## 6 INJURED WAS GREASING CRUSHER BEARING WHEN PICK CAME OFF CRUSHER, KNOCKING COVER OFF AND STRIKING INDIVIDUAL IN LEFT HAND AND FOREARM.
## Factors
### SUBUNIT_CO
data$SUBUNIT_CD_F[data$SUBUNIT_CD==1]<-"Underground"
data$SUBUNIT_CD_F[data$SUBUNIT_CD==2]<-"Surface"
data$SUBUNIT_CD_F[data$SUBUNIT_CD==3]<-"Other Locations"
data$SUBUNIT_CD_F<-as.factor(data$SUBUNIT_CD_F)
head(data$SUBUNIT_CD_F)
## [1] Other Locations Other Locations Surface Surface
## [5] Underground Underground
## Levels: Other Locations Surface Underground
### DEGREE_INJURY_CD : 1 = Death, 2=Disability 3=Lost Time 4=No Lost Time
data$DEGREE_INJURY_CD_F[data$DEGREE_INJURY_CD==1]<-"Death"
data$DEGREE_INJURY_CD_F[data$DEGREE_INJURY_CD==2]<-"Disability"
data$DEGREE_INJURY_CD_F[data$DEGREE_INJURY_CD==3]<-"LostTime"
data$DEGREE_INJURY_CD_F[data$DEGREE_INJURY_CD==4]<-"NoLostTime"
data$DEGREE_INJURY_CD_F<-as.factor(data$DEGREE_INJURY_CD_F)
head(data$DEGREE_INJURY_CD_F)
## [1] NoLostTime NoLostTime LostTime LostTime NoLostTime NoLostTime
## Levels: Death Disability LostTime NoLostTime
# Text Analysis
## Corpus
###Cleaning
data$NARRATIVE <- gsub("[^[:alnum:]///' ]", "", data$NARRATIVE)
TEXTBASIC<- Corpus(VectorSource(data$NARRATIVE))
CORPUS <- tm_map(TEXTBASIC, removePunctuation)
## Warning in tm_map.SimpleCorpus(TEXTBASIC, removePunctuation):
## transformation drops documents
as.character(inspect(CORPUS[100]))
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
##
## [1] Employee was standing on a crossflow hose hitting an 8 pipe and slipped and fell hurting right lower leg EMPLOYEE DID NOT START LOSING TIME UNTIL 6308 DUE TO INFECTION
## [1] "Employee was standing on a crossflow hose hitting an 8 pipe and slipped and fell hurting right lower leg EMPLOYEE DID NOT START LOSING TIME UNTIL 6308 DUE TO INFECTION"
## [2] "list(language = \"en\")"
## [3] "list()"
CORPUS <- tm_map(CORPUS, removeNumbers)
## Warning in tm_map.SimpleCorpus(CORPUS, removeNumbers): transformation drops
## documents
as.character(inspect(CORPUS[100]))
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
##
## [1] Employee was standing on a crossflow hose hitting an pipe and slipped and fell hurting right lower leg EMPLOYEE DID NOT START LOSING TIME UNTIL DUE TO INFECTION
## [1] "Employee was standing on a crossflow hose hitting an pipe and slipped and fell hurting right lower leg EMPLOYEE DID NOT START LOSING TIME UNTIL DUE TO INFECTION"
## [2] "list(language = \"en\")"
## [3] "list()"
CORPUS <- tm_map(CORPUS, tolower)
## Warning in tm_map.SimpleCorpus(CORPUS, tolower): transformation drops
## documents
as.character(inspect(CORPUS[100]))
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
##
## [1] employee was standing on a crossflow hose hitting an pipe and slipped and fell hurting right lower leg employee did not start losing time until due to infection
## [1] "employee was standing on a crossflow hose hitting an pipe and slipped and fell hurting right lower leg employee did not start losing time until due to infection"
## [2] "list(language = \"en\")"
## [3] "list()"
CORPUS <- tm_map(CORPUS, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(CORPUS, removeWords, stopwords("english")):
## transformation drops documents
CORPUS <- tm_map(CORPUS, removeWords, stopwords("SMART"))
## Warning: 'stopwords(language = "SMART")' is deprecated.
## Use 'stopwords(source = "smart")' instead.
## See help("Deprecated")
## Warning in tm_map.SimpleCorpus(CORPUS, removeWords, stopwords("SMART")):
## transformation drops documents
as.character(inspect(CORPUS[100]))
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
##
## [1] employee standing crossflow hose hitting pipe slipped fell hurting lower leg employee start losing time due infection
## [1] "employee standing crossflow hose hitting pipe slipped fell hurting lower leg employee start losing time due infection"
## [2] "list(language = \"en\")"
## [3] "list()"
CORPUS <- tm_map(CORPUS, removeWords, c("and", "for", "that","not", "did",
"due", "to", "this", "are", "is",
"am","with","their","the","can",
"from","article", "study", "ee",
"employee", "left", "right", "up","down"))
## Warning in tm_map.SimpleCorpus(CORPUS, removeWords, c("and", "for",
## "that", : transformation drops documents
#CORPUS <- tm_map(CORPUS, stemDocument)
#as.character(inspect(CORPUS[100]))
CORPUS <- tm_map(CORPUS, stripWhitespace)
## Warning in tm_map.SimpleCorpus(CORPUS, stripWhitespace): transformation
## drops documents
as.character(inspect(CORPUS[100]))
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
##
## [1] standing crossflow hose hitting pipe slipped fell hurting lower leg start losing time infection
## [1] " standing crossflow hose hitting pipe slipped fell hurting lower leg start losing time infection"
## [2] "list(language = \"en\")"
## [3] "list()"
#CORPUS <- tm_map(CORPUS, PlainTextDocument)
#as.character(inspect(CORPUS[100]))
#inspect(CORPUS)
tdm <- TermDocumentMatrix(CORPUS)
dtm <- DocumentTermMatrix(CORPUS)
dim(tdm)
## [1] 26281 83065
dim(dtm)
## [1] 83065 26281
inspect(dtm)
## <<DocumentTermMatrix (documents: 83065, terms: 26281)>>
## Non-/sparse entries: 1141901/2181889364
## Sparsity : 100%
## Maximal term length: 30
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs back causing fell finger hand piece rock roof slipped work
## 17219 0 1 1 0 0 0 1 0 0 0
## 21659 1 0 0 0 0 0 0 0 0 1
## 26361 0 2 0 0 0 0 2 0 0 0
## 35028 1 0 1 0 0 0 0 0 0 0
## 36027 0 0 0 2 0 0 0 0 0 0
## 36558 1 0 0 0 0 0 0 0 0 0
## 52021 1 0 0 0 0 0 0 0 0 0
## 55303 0 0 1 0 0 0 0 0 0 0
## 59415 0 1 0 1 0 0 0 0 0 0
## 8261 0 0 0 0 0 0 0 0 0 0
inspect(tdm)
## <<TermDocumentMatrix (terms: 26281, documents: 83065)>>
## Non-/sparse entries: 1141901/2181889364
## Sparsity : 100%
## Maximal term length: 30
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 17219 21659 26361 35028 36027 36558 52021 55303 59415 8261
## back 0 1 0 1 0 1 1 0 0 0
## causing 1 0 2 0 0 0 0 0 1 0
## fell 1 0 0 1 0 0 0 1 0 0
## finger 0 0 0 0 2 0 0 0 1 0
## hand 0 0 0 0 0 0 0 0 0 0
## piece 0 0 0 0 0 0 0 0 0 0
## rock 1 0 2 0 0 0 0 0 0 0
## roof 0 0 0 0 0 0 0 0 0 0
## slipped 0 0 0 0 0 0 0 0 0 0
## work 0 1 0 0 0 0 0 0 0 0
df <- data.frame(text = get("content", CORPUS))
df$text<-as.character(df$text)
data$NARRATIVE_DF<-df$text
## Word Frequency
dfm<-dfm(df$text)
features_dfm <- textstat_frequency(dfm(df$text), n = 20)
features_dfm
## feature frequency rank docfreq group
## 1 back 19747 1 17273 all
## 2 fell 16879 2 16269 all
## 3 rock 14755 3 11937 all
## 4 hand 13971 4 11216 all
## 5 roof 13144 5 10022 all
## 6 causing 12325 6 11796 all
## 7 work 11976 7 10077 all
## 8 finger 11178 8 8929 all
## 9 slipped 10977 9 10889 all
## 10 piece 10684 10 9977 all
## 11 pain 10648 11 9932 all
## 12 struck 10504 12 10117 all
## 13 belt 10388 13 7557 all
## 14 knee 8918 14 7328 all
## 15 miner 8881 15 7088 all
## 16 injured 8812 16 7163 all
## 17 felt 8393 17 8259 all
## 18 hit 7606 18 7105 all
## 19 cable 7481 19 5485 all
## 20 head 7250 20 6322 all
# Sort by reverse frequency order
features_dfm$feature <- with(features_dfm, reorder(feature, -frequency))
p <- ggplot(features_dfm, aes(feature, frequency))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p

## TF-IDF
tfidfdfm<-dfm_tfidf(dfm)
head(tfidfdfm[, 5:10])
## Document-feature matrix of: 6 documents, 6 features (83.3% sparse).
## 6 x 6 sparse Matrix of class "dfm"
## features
## docs rubber gasket material flat table shop
## text1 2.270083 3.743327 1.86634 2.345387 2.94629 2.249172
## text2 0 0 0 0 0 0
## text3 0 0 0 0 0 0
## text4 0 0 0 0 0 0
## text5 0 0 0 0 0 0
## text6 0 0 0 0 0 0
features_tfidfdfm <- textstat_frequency(tfidfdfm, n = 20)
features_tfidfdfm
## feature frequency rank docfreq group
## 1 back 13468.447 1 17273 all
## 2 rock 12431.425 2 11937 all
## 3 hand 12148.903 3 11216 all
## 4 roof 12072.286 4 10022 all
## 5 fell 11951.298 5 16269 all
## 6 work 10971.056 6 10077 all
## 7 finger 10827.181 7 8929 all
## 8 belt 10814.621 8 7557 all
## 9 causing 10447.697 9 11796 all
## 10 piece 9833.747 10 9977 all
## 11 pain 9821.517 11 9932 all
## 12 slipped 9686.435 12 10889 all
## 13 struck 9604.504 13 10117 all
## 14 miner 9492.851 14 7088 all
## 15 knee 9403.430 15 7328 all
## 16 injured 9378.815 16 7163 all
## 17 cable 8829.386 17 5485 all
## 18 bolt 8606.047 18 5153 all
## 19 felt 8413.904 19 8259 all
## 20 truck 8270.390 20 4703 all
## Kwic by top 30 words
quacor<-quanteda::corpus(data$NARRATIVE_DF)
head(kwic(quacor, "work*", window = 3, valuetype = "glob"), 30)
##
## [text7, 28] lt shoulder lost | work |
## [text8, 13] rotator cuff tear | work |
## [text9, 16] shoulder employees lost | work |
## [text10, 2] stated | working |
## [text11, 13] tears surgery lost | work |
## [text13, 12] torn meniscus lost | work |
## [text14, 9] hand employees lost | work |
## [text46, 23] protect longer accommodate | work |
## [text48, 15] mashed finger surgery | work |
## [text52, 25] ease opinion injury | work |
## [text52, 28] work related agreed | work |
## [text56, 1] | working |
## [text91, 8] putting joint pipe | working |
## [text115, 18] finger started missing | work |
## [text116, 3] started missing | work |
## [text118, 12] back started missing | work |
## [text120, 9] shoulder began missing | work |
## [text154, 9] ees shift failed | workable |
## [text159, 5] office personnel condition | worked |
## [text163, 17] physical therapy treatment | work |
## [text163, 20] work restrictions lost | workdays |
## [text166, 14] shoulder released return | work |
## [text172, 1] | work |
## [text172, 5] injury turned sa | workers |
## [text176, 9] eye washed eye | worked |
## [text184, 8] hands feels related | work |
## [text189, 16] knee impact continued | work |
## [text193, 9] hit forearm returned | work |
## [text213, 13] shoulder pop continued | working |
## [text214, 20] lower back returned | work |
##
## day occur untul
## day occur surgery
## day surgery
## feeder ram car
## day
## day occur
## day occurred surgery
## restrictions starting
##
## related agreed work
## related started losing
## inert atmosphere wearing
## december surgery
## surgery
## diagnosed alleged carpal
##
##
## power ladder installed
## doctor underworkers comp
## restrictions lost workdays
## occurred result injury
## making lost time
## injury turned sa
## comp bulged discs
## shift doctor
## belt maintenance years
## knee surgery
## reevaluated missed damage
## surgery performed rt
## restricted duty
head(kwic(quacor, "dead*", window = 3, valuetype = "glob"), 30)
##
## [text1624, 23] tram accidentally hit | dead |
## [text2539, 17] desired height time | dead |
## [text2663, 14] transported surface pronounced | dead |
## [text2771, 24] regained consciousness pronounced | dead |
## [text2785, 16] revive unsuccessful pronounced | dead |
## [text4093, 7] mine mechanic found | dead |
## [text5324, 4] security guard found | dead |
## [text5604, 16] grabbing handrail located | deadlink |
## [text5604, 24] lower leg foot | deadlink |
## [text6591, 14] feb pmman found | dead |
## [text6591, 21] paper man found | dead |
## [text7187, 2] pulling | dead |
## [text7917, 19] ill unsuccessful pronounced | dead |
## [text8224, 2] states | deadheading |
## [text8504, 6] slumped dozer pronounced | dead |
## [text9751, 24] memorial hospital pronounced | dead |
## [text12574, 16] emergency hospital pronounced | dead |
## [text13131, 15] tub coming back | deadhead |
## [text13484, 13] top tree pulled | dead |
## [text13594, 14] proceeded employeewas pronounced | dead |
## [text13741, 26] transported hospital pronounced | dead |
## [text14690, 15] transported hospital pronounced | dead |
## [text15759, 4] injured plugging charger | dead |
## [text15905, 21] community hospital pronounced | dead |
## [text15958, 10] cable tray foot | deadman |
## [text16318, 30] thick victim pronounced | dead |
## [text18027, 15] ambulance technician pronounced | dead |
## [text20384, 5] twine pocket knife | dead |
## [text21935, 8] area subsequently pronounced | dead |
## [text23512, 14] ambulance hospital pronounced | dead |
##
## man tram moved
## end cable bracket
## helicopter ambulance personnel
## apparent heart attack
##
## cab fuel truck
## inside guard shack
## causing fall backward
## pusharm causing fall
## mountain work thursday
## weiser state forest
## mantrip chain broke
## hospital work related
## endloaderforklift drill site
## emergency room doctor
## pm
## apparent heart attack
## machine foottoes pinched
## tree approx diameter
## hospital monday nov
## pm natural
## approximately pm
## set ram batteries
##
## switch
## scene assistant knox
## arrival hospital
## end shoe dropped
## result natural
## pm death listed
head(kwic(quacor, "injury*", window = 3, valuetype = "glob"), 30)
##
## [text52, 24] dis ease opinion | injury |
## [text163, 23] workdays occurred result | injury |
## [text164, 13] equipment deck ground | injury |
## [text171, 17] finger msha reportable | injury |
## [text172, 2] work | injury |
## [text172, 15] attention incident relate | injury |
## [text215, 11] confirming workrelatedness foot | injury |
## [text274, 14] shift work pinpoint | injury |
## [text288, 11] sled lost time | injury |
## [text298, 11] face medical treatment | injury |
## [text299, 13] stitches medical treatment | injury |
## [text301, 13] rail restricted duty | injury |
## [text302, 12] reportable lost time | injury |
## [text303, 11] foot medical treatment | injury |
## [text307, 13] pop lost time | injury |
## [text308, 26] abdomen lost time | injury |
## [text309, 10] reportable lost time | injury |
## [text319, 8] knee swollen date | injury |
## [text327, 24] reoccurrance prior shoulder | injury |
## [text331, 18] forehead belt structure | injury |
## [text354, 19] member truss resulting | injury |
## [text376, 8] forearm medical treatment | injury |
## [text377, 16] leg lost time | injury |
## [text378, 7] wrist medical treatment | injury |
## [text379, 12] reportable lost time | injury |
## [text395, 20] recently trimmed corner | injury |
## [text404, 14] worked week reporting | injury |
## [text418, 27] point causing crushing | injury |
## [text436, 14] rear timber reaggravated | injury |
## [text439, 7] aggravated preexisting personal | injury |
##
## work related agreed
##
## resulted ear laceration
##
## turned sa workers
## degree
##
##
## november
##
##
##
## surgery
##
##
##
##
## reported day missed
## stated dr
## required sutures
## rib cage
##
## surgery
##
## surgery
## required stitches repair
##
##
## day operating shuttle
## dr days put
## quanteda corpus
dd<-corpus(data$NARRATIVE_DF)
docvars(dd, "Year") <- data$CAL_YR
docvars(dd, "Location") <- data$SUBUNIT_CD_F
docvars(dd, "Degree") <- data$DEGREE_INJURY_CD_F
head(docvars(dd))
## Year Location Degree
## text1 2002 Other Locations NoLostTime
## text2 2003 Other Locations NoLostTime
## text3 2003 Surface LostTime
## text4 2004 Surface LostTime
## text5 2002 Underground NoLostTime
## text6 2002 Underground NoLostTime
## Lexical dispersion plot : Plotting a kwic object produces a lexical dispersion plot which allows us to visualize the occurrences of particular terms throughout the text. We call these “x-ray” plots due to their similarity.
#ldp <- corpus_subset(dd, Location == "Underground")
#kwic(ldp, pattern = "work") %>%
# textplot_xray()
#textplot_xray(
# kwic(ldp, "work"),
# kwic(ldp, "dead"),
# kwic(ldp, "injury")
#)