# Packages ----
# Install whichever required packages are missing, then attach all of them.
# The first argument of installed.packages() is a library path (lib.loc), not
# a package name, so the names are matched against the row names of its
# result instead of being passed to it.
required_packages <- c(
  "rJava", "memoise", "KoNLP", "tm",
  "wordcloud", "dplyr", "stringr", "RColorBrewer"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  # NOTE(review): KoNLP may not be available from the default CRAN
  # repository; if so it has to be installed from another source -- confirm.
  install.packages(missing_packages)
}
library(rJava)
library(memoise)
library(KoNLP)
library(tm)
library(wordcloud)
library(dplyr)
library(stringr)
# Color palettes used to color the words
library(RColorBrewer)
# Select the Sejong dictionary for KoNLP.
KoNLP::useSejongDic()

# Step 1. Read the input file; readLines() yields one element per line.
txt <- readLines(con = "hiphop.txt")
head(x = txt)

# Step 2. Blank out special characters.
# First pass: replace every non-word character with a space.
txt <- txt %>%
  stringr::str_replace_all(pattern = "\\W", replacement = " ")
head(x = txt)
# Second pass: replace every non-alphabetic character with a space.
txt <- txt %>%
  stringr::str_replace_all(pattern = "[^[:alpha:]]", replacement = " ")
txt
# Step 3. Extract the nouns from each line.
# lapply() always returns a list with one element per element of txt, whereas
# sapply() may simplify to a vector or matrix depending on the input. The
# later steps only use unlist(nouns), which works on the list directly.
nouns <- lapply(txt, KoNLP::extractNoun)
class(nouns)
nouns
# Step 4. Build the per-word frequency table.
wordcount <- table(unlist(nouns))
df_word <- as.data.frame(wordcount, stringsAsFactors = FALSE)

# Step 5. Rename the columns.
names(df_word) # as.data.frame() names them Var1 and Freq automatically
df_word <- dplyr::rename(df_word, word = Var1, freq = Freq)

# Step 6. Keep only words that are two or more characters long.
df_word <- dplyr::filter(df_word, nchar(word) >= 2)
df_word

# Step 7. Sort by descending frequency and keep the first 20 rows.
top_20 <- df_word %>%
  dplyr::arrange(dplyr::desc(freq)) %>%
  head(20)
top_20
# Draw the word cloud from the filtered frequency table (df_word, not top_20).
wordcloud::wordcloud(
  words = df_word$word,
  freq = df_word$freq,
  min.freq = 2,          # words with a lower frequency are not plotted
  max.words = 200,       # upper bound on the number of words plotted
  random.order = FALSE,  # plot words in decreasing frequency
  rot.per = 0.1,         # proportion of words drawn with 90 degree rotation
  scale = c(4, 0.3),     # range of the word sizes
  colors = RColorBrewer::brewer.pal(8, "Dark2")
)
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQoNCg0KYGBge3J9DQppZigickphdmEiICVpbiUgaW5zdGFsbGVkLnBhY2thZ2VzKCJySmF2YSIpID09IEZBTFNFKWluc3RhbGwucGFja2FnZXMoInJKYXZhIikNCmxpYnJhcnkockphdmEpDQppZigibWVtb2lzZSIgJWluJSBpbnN0YWxsZWQucGFja2FnZXMoIm1lbW9pc2UiKSA9PSBGQUxTRSlpbnN0YWxsLnBhY2thZ2VzKCJtZW1vaXNlIikNCmxpYnJhcnkobWVtb2lzZSkNCmlmKCJLb05MUCIgJWluJSBpbnN0YWxsZWQucGFja2FnZXMoIktvTkxQIikgPT0gRkFMU0UpaW5zdGFsbC5wYWNrYWdlcygiS29OTFAiKQ0KbGlicmFyeShLb05MUCkNCmlmKCJ0bSIgJWluJSBpbnN0YWxsZWQucGFja2FnZXMoInRtIikgPT0gRkFMU0UpaW5zdGFsbC5wYWNrYWdlcygidG0iKQ0KbGlicmFyeSh0bSkNCmlmKCJ3b3JkY2xvdWQiICVpbiUgaW5zdGFsbGVkLnBhY2thZ2VzKCJ3b3JkY2xvdWQiKSA9PSBGQUxTRSlpbnN0YWxsLnBhY2thZ2VzKCJ3b3JkY2xvdWQiKQ0KbGlicmFyeSh3b3JkY2xvdWQpDQppZigiZHBseXIiICVpbiUgaW5zdGFsbGVkLnBhY2thZ2VzKCJkcGx5ciIpID09IEZBTFNFKWluc3RhbGwucGFja2FnZXMoImRwbHlyIikNCmxpYnJhcnkoZHBseXIpDQppZigic3RyaW5nciIgJWluJSBpbnN0YWxsZWQucGFja2FnZXMoInN0cmluZ3IiKSA9PSBGQUxTRSlpbnN0YWxsLnBhY2thZ2VzKCJzdHJpbmdyIikNCmxpYnJhcnkoc3RyaW5ncikNCiMg6riA7J6Q7JeQIOyDiSDsnoXtmIDspIwNCmlmKCJSQ29sb3JCcmV3ZXIiICVpbiUgaW5zdGFsbGVkLnBhY2thZ2VzKCJSQ29sb3JCcmV3ZXIiKSA9PSBGQUxTRSlpbnN0YWxsLnBhY2thZ2VzKCJSQ29sb3JCcmV3ZXIiKQ0KbGlicmFyeShSQ29sb3JCcmV3ZXIpDQoNCktvTkxQOjp1c2VTZWpvbmdEaWMoKQ0KIyBzdGVwIDEuIOuMgOyDge2MjOydvCDsnb3slrTrk6TsnbTquLAuKOuNsOydtO2EsOulvCDroZzrlKntlZzri6QuKQ0KdHh0IDwtIHJlYWRMaW5lcygiaGlwaG9wLnR4dCIpDQpoZWFkKHR4dCkNCiMgc3RlcCAyLiDtirnsiJjrrLjsnpAg7KCc6rGwDQp0eHQgPC0gc3RyaW5ncjo6c3RyX3JlcGxhY2VfYWxsKHR4dCwiXFxXIiwiICIpDQpoZWFkKHR4dCkNCnR4dCA8LSBzdHJpbmdyOjpzdHJfcmVwbGFjZV9hbGwodHh0LCJbXls6YWxwaGE6XV0iLCIgIikNCnR4dA0KIyBzdGVwIDMuIOuqheyCrOunjCDstpTstpwNCm5vdW5zIDwtIA0KICBzYXBwbHkoDQogICAgdHh0LA0KICAgIEtvTkxQOjpleHRyYWN0Tm91biwNCiAgICBVU0UuTkFNRVMgPSBGDQogICkNCg0KY2xhc3Mobm91bnMpDQpub3Vucw0KIyBzdGVwIDQuIOuLqOyWtOuzhCDruYjrj4TtkZwg7J6R7ISxDQp3b3JkY291bnQgPC0gdGFibGUodW5saXN0KG5vdW5zKSkNCmRmX3dvcmQgPC0gYXMuZGF0YS5mcmFtZSh3b3JkY291bnQsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGKQ0KDQojIHN0ZXAgNS4g67OA7IiY66qFIOyImOyglQ0KbmFtZXMo
ZGZfd29yZCkgIyDsnpDrj5nsnLzroZwgVmFyMSwgRnJlcSDsg53shLHrkKgNCmRmX3dvcmQgPC0gZHBseXI6OnJlbmFtZSgNCiAgZGZfd29yZCwNCiAgd29yZCA9IFZhcjEsDQogIGZyZXEgPSBGcmVxDQopDQoNCiMgc3RlcCA2LiDrkZAg6riA7J6QIOydtOyDgSDri6jslrQg7LaU7LacDQpkZl93b3JkIDwtIGRwbHlyOjpmaWx0ZXIoZGZfd29yZCwgbmNoYXIod29yZCk+PTIpDQoNCmRmX3dvcmQNCg0KIyBzdGVwIDcuIOu5iOuPhOyInCDsoJXroKwg7ZuEIOyDgeychCAyMOqwnCDri6jslrTrp4wg7LaU7LacDQp0b3BfMjAgPC0gZGZfd29yZCAlPiUgZHBseXI6OmFycmFuZ2UoZGVzYyhmcmVxKSkgJT4lIGhlYWQoMjApDQp0b3BfMjANCg0Kd29yZGNsb3VkOjp3b3JkY2xvdWQoDQogIHdvcmRzID0gZGZfd29yZCR3b3JkLA0KICBmcmVxID0gZGZfd29yZCRmcmVxLA0KICBtaW4uZnJlcSA9IDIsDQogIG1heC53b3JkcyA9IDIwMCwNCiAgcmFuZG9tLm9yZGVyID0gIEYsDQogIHJvdC5wZXIgPSAuMSwNCiAgc2NhbGUgPSBjKDQsMC4zKSwNCiAgY29sb3JzID0gYnJld2VyLnBhbCg4LCJEYXJrMiIpDQopDQoNCmBgYA0K