20181013 Verbatim Practice - TM
# Packages this script depends on, in the order they are attached.
required_pkgs <- c(
  "rJava", "memoise", "stringr", "KoNLP", "tm",
  "wordcloud", "RColorBrewer", "dplyr", "readxl"
)

# Install a package only when it is genuinely missing, then attach it.
# The previous check called installed.packages("<pkg>"), which passes the
# package name as `lib.loc` (a library directory), so the test never found
# anything and every package was reinstalled on each run; readxl was also
# installed unconditionally.
for (pkg in required_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
# Switch the session to the folder that holds the practice data.
setwd("C:\\Users\\Administrator\\rlang_weekend2\\Data_R_181013")
# Read the first worksheet of the workbook; if the workbook has several
# sheets, change `sheet` to 2, 3, 4, ... to read another one.
tgt1 <- readxl::read_excel(path = "Verbatim.xlsx", sheet = 1)
# Show the first six rows as a quick sanity check.
head(tgt1, n = 6L)
# useSejongDic()
# KoNLP::buildDictionary(
# ext_dic = c('sejong','woorimalsam')
# )
#
# verbatim <- str_replace_all(verbatim,"\\W"," ")
# head(verbatim)
#
# nouns <- sapply(
# verbatim,
# extractNoun,
# USE.NAMES = F
# )
#
# nouns <- extractNoun(verbatim)
# wordcount <- table(unlist(nouns))
# df_word <- as.data.frame(wordcount, stringsAsFactors = FALSE)
# df_word <- dplyr::rename(df_word,
# word = Var1,
# freq = Freq)
# # head(df_word)
# df_word <- filter(df_word, nchar(word)>=2)
#
# top20 <- df_word %>%
# arrange(desc(freq)) %>%
# head(20)
# Build a corpus from the spreadsheet data and normalise its text.
# NOTE(review): the whole table returned by read_excel() is handed to
# VectorSource() rather than a single text column -- confirm that this is
# the intended unit of analysis.
tgt2 <- VCorpus(VectorSource(tgt1))

# Lower-case Latin letters. Wrapping tolower() in content_transformer()
# keeps each element a text document; a bare tolower() returns plain
# character vectors, which then had to be re-wrapped with
# PlainTextDocument() (a step that creates fresh documents and does not
# carry the original document metadata over).
tgt2 <- tm_map(tgt2, content_transformer(tolower))

# Remove punctuation marks (periods, semicolons, colons, ...).
tgt2 <- tm_map(tgt2, removePunctuation)

# English stop words to drop from the corpus.
gsb <- stopwords("english")
tgt2 <- tm_map(tgt2, removeWords, gsb)

# Collapse runs of whitespace last, so the gaps left behind by the
# removals above are cleaned up as well.
tgt2 <- tm_map(tgt2, tm::stripWhitespace)
# Count term occurrences per document.
tgt3 <- TermDocumentMatrix(tgt2)

# Terms that occur at least 5 times.
findFreqTerms(tgt3, lowfreq = 5)

# Terms correlated (>= 0.5) with "bmw". The corpus is lower-cased before
# the matrix is built, so the lookup term must be lower case too; the
# previous upper-case "BMW" could never match a term.
findAssocs(tgt3, terms = "bmw", corlimit = 0.5)

# Dense copy of the matrix for inspection.
tgt4 <- as.matrix(tgt3)
head(tgt4)

# Total frequency of each term, most frequent first.
tgt4 <- sort(rowSums(tgt4), decreasing = TRUE)
tgt4
# Note: use KoNLP together with the gsb function.
# Eight colours from the ColorBrewer "Dark2" palette.
pal <- brewer.pal(8, "Dark2")

# Fix the RNG seed so repeated runs draw the same cloud.
set.seed(1234)

# Draw the word cloud from the term totals in tgt4. TRUE/FALSE are spelled
# out because T and F are ordinary variables that can be reassigned.
wordcloud(
  words = names(tgt4),
  freq = tgt4,
  scale = c(2.5, 0.1),   # word size ranges from 0.1 to 2.5
  rot.per = 0.25,        # proportion of words drawn rotated
  min.freq = 2,          # only words that occur at least 2 times
  random.order = FALSE,  # place high-frequency words in the centre
  random.color = TRUE,   # pick colours from `pal` at random
  colors = pal
)
LS0tDQp0aXRsZTogIjIwMTgxMDEzIFZlcmJhdGltIFByYWN0aWNlIC1UTSDsgqzsmqkiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQoyMDE4MTAxMyBWZXJiYXRpbSBQcmFjdGljZSAtIFRNDQoNCmBgYHtyfQ0KaWYoInJKYXZhIiAlaW4lIGluc3RhbGxlZC5wYWNrYWdlcygickphdmEiKSA9PSBGQUxTRSlpbnN0YWxsLnBhY2thZ2VzKCJySmF2YSIpDQpsaWJyYXJ5KHJKYXZhKQ0KaWYoIm1lbW9pc2UiICVpbiUgaW5zdGFsbGVkLnBhY2thZ2VzKCJtZW1vaXNlIikgPT0gRkFMU0UpaW5zdGFsbC5wYWNrYWdlcygibWVtb2lzZSIpDQpsaWJyYXJ5KG1lbW9pc2UpDQppZigic3RyaW5nciIgJWluJSBpbnN0YWxsZWQucGFja2FnZXMoInN0cmluZ3IiKSA9PSBGQUxTRSlpbnN0YWxsLnBhY2thZ2VzKCJzdHJpbmdyIikNCmxpYnJhcnkoc3RyaW5ncikNCmlmKCJLb05MUCIgJWluJSBpbnN0YWxsZWQucGFja2FnZXMoIktvTkxQIikgPT0gRkFMU0UpaW5zdGFsbC5wYWNrYWdlcygiS29OTFAiKQ0KbGlicmFyeShLb05MUCkNCmlmKCJ0bSIgJWluJSBpbnN0YWxsZWQucGFja2FnZXMoInRtIikgPT0gRkFMU0UpaW5zdGFsbC5wYWNrYWdlcygidG0iKQ0KbGlicmFyeSh0bSkNCmlmKCJ3b3JkY2xvdWQiICVpbiUgaW5zdGFsbGVkLnBhY2thZ2VzKCJ3b3JkY2xvdWQiKSA9PSBGQUxTRSlpbnN0YWxsLnBhY2thZ2VzKCJ3b3JkY2xvdWQiKQ0KbGlicmFyeSh3b3JkY2xvdWQpDQppZigiUkNvbG9yQnJld2VyIiAlaW4lIGluc3RhbGxlZC5wYWNrYWdlcygiUkNvbG9yQnJld2VyIikgPT0gRkFMU0UpaW5zdGFsbC5wYWNrYWdlcygiUkNvbG9yQnJld2VyIikNCmxpYnJhcnkoUkNvbG9yQnJld2VyKQ0KaWYoImRwbHlyIiAlaW4lIGluc3RhbGxlZC5wYWNrYWdlcygiZHBseXIiKSA9PSBGQUxTRSlpbnN0YWxsLnBhY2thZ2VzKCJkcGx5ciIpDQpsaWJyYXJ5KCJkcGx5ciIpDQoNCmluc3RhbGwucGFja2FnZXMoInJlYWR4bCIpDQpsaWJyYXJ5KCJyZWFkeGwiKQ0KDQpzZXR3ZCgnQzpcXFVzZXJzXFxBZG1pbmlzdHJhdG9yXFxybGFuZ193ZWVrZW5kMlxcRGF0YV9SXzE4MTAxMycpDQp0Z3QxICA8LSByZWFkX2V4Y2VsKCJWZXJiYXRpbS54bHN4IiwgMSkNCiMg66eM7JW9IHNoZWV06rCAIOyXrOufrOqwnOydtOuptCDsiKvsnpDrp4wsIDIsIDMsIDTroZwg67OA6rK96rCA64qlDQpoZWFkKHRndDEpDQoNCiMgdXNlU2Vqb25nRGljKCkNCiMgS29OTFA6OmJ1aWxkRGljdGlvbmFyeSgNCiMgICBleHRfZGljID0gYygnc2Vqb25nJywnd29vcmltYWxzYW0nKQ0KIyApDQojIA0KIyB2ZXJiYXRpbSA8LSBzdHJfcmVwbGFjZV9hbGwodmVyYmF0aW0sIlxcVyIsIiAiKQ0KIyBoZWFkKHZlcmJhdGltKQ0KIyANCiMgbm91bnMgPC0gc2FwcGx5KA0KIyAgICAgICAgICAgICAgICAgdmVyYmF0aW0sDQojICAgICAgICAgICAgICAgICBleHRyYWN0Tm91biwNCiMgICAgICAgICAgICAgICAgIFVTRS5OQU1FUyA9IEYNCiMgKQ0KIyANCiMgbm91bnMgPC0gZXh0cmFjdE5vdW4o
dmVyYmF0aW0pDQojIHdvcmRjb3VudCA8LSB0YWJsZSh1bmxpc3Qobm91bnMpKQ0KIyBkZl93b3JkIDwtIGFzLmRhdGEuZnJhbWUod29yZGNvdW50LHN0cmluZ0FzRmFjdG9yID0gRikNCiMgZGZfd29yZCA8LSBkcGx5cjo6cmVuYW1lKGRmX3dvcmQsDQojICAgICAgICAgICAgICAgICAgICAgICAgICB3b3JkID0gVmFyMSwNCiMgICAgICAgICAgICAgICAgICAgICAgICAgIGZyZXEgPSBGcmVxKQ0KIyAjIGhlYWQoZGZfd29yZCkNCiMgZGZfd29yZCA8LSBmaWx0ZXIoZGZfd29yZCwgbmNoYXIod29yZCk+PTIpDQojIA0KIyB0b3AyMCA8LSBkZl93b3JkICU+JSANCiMgICBhcnJhbmFnZShkZXNjKGZyZXEpKSAlPiUgDQojICAgaGVhZCgyMCkNCg0KdGd0MiA8LSBWQ29ycHVzKFZlY3RvclNvdXJjZSh0Z3QxKSkNCnRndDIgPC0gdG1fbWFwKHRndDIsIHRtOjpzdHJpcFdoaXRlc3BhY2UpICMg6rO167Cx7LKY66asDQp0Z3QyIDwtIHRtX21hcCh0Z3QyLCB0b2xvd2VyKSAjIOyVjO2MjOuys+ydtOuptCDshozrrLjsnpANCnRndDIgPC0gdG1fbWFwKHRndDIsIHJlbW92ZVB1bmN0dWF0aW9uKSAjIOuniOy5qO2RnCwg6rO167CxLCDshLjrr7jsvZzroaAsIOy9nOuhoOygnOqxsA0KDQpnc2IgPC0gYyhzdG9wd29yZHMoDQogICgnZW5nbGlzaCcpDQopKQ0KdGd0MiA8LSB0bV9tYXAodGd0MiwgcmVtb3ZlV29yZHMsIGdzYikNCnRndDIgPC0gdG1fbWFwKHRndDIsIFBsYWluVGV4dERvY3VtZW50KQ0KdGd0MyA8LSBUZXJtRG9jdW1lbnRNYXRyaXgodGd0MikNCmZpbmRGcmVxVGVybXModGd0MywgNSkNCmZpbmRBc3NvY3ModGd0MywgIkJNVyIsIDAuNSkNCnRndDQgPC0gYXMubWF0cml4KHRndDMpDQpoZWFkKHRndDQpDQp0Z3Q0IDwtIHNvcnQocm93U3Vtcyh0Z3Q0KSxkZWNyZWFzaW5nID0gVCkNCnRndDQNCg0KIyBLb05MUCDsgqzsmqntlbTshJwgZ3NiIGZ1bmN0aW9uIOyCrOyalA0KDQpwYWwgPC0gYnJld2VyLnBhbCg4LCJEYXJrMiIpDQpzZXQuc2VlZCgxMjM0KSANCndvcmRjbG91ZCgNCiAgbmFtZXModGd0NCksDQogIGZyZXEgPSB0Z3Q0LA0KICBzY2FsZSA9IGMoMi41LDAuMSksICMg64uo7Ja07YGs6riwIDAuMSB+IDIuNQ0KICByb3QucGVyID0wLjI1LCAjIO2ajOyghOu5hOycqA0KICBtaW4uZnJlcSA9IDIsICMg7LWc7KCAIOu5iOuPhOyImCAy7ZqM7J207IOBDQogIHJhbmRvbS5vcmRlciA9IEYsICMg6rOg67mI64+EIOuLqOyWtCDspJHslZnrsLDsuZgNCiAgcmFuZG9tLmNvbG9yID0gVCwNCiAgY29sb3JzID0gcGFsDQopDQoNCmBgYA0KDQoNCg==