# Package setup ----
# Install whichever required packages are missing, then attach everything the
# script uses. The first argument of installed.packages() is a library path,
# not a package name, so the names are looked up in the row names of its
# result; passing the package name to it makes the check always report
# "missing" and reinstalls the package on every run.
library(rJava)
required_pkgs <- c(
  "memoise", "KoNLP", "tm", "wordcloud", "dplyr", "stringr", "RColorBrewer"
)
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(memoise)
library(KoNLP)
library(tm)
library(wordcloud)
library(dplyr)
library(ggplot2)
library(stringr)
library(RColorBrewer)

# Load the Sejong dictionary into KoNLP before extracting nouns.
KoNLP::useSejongDic()
getwd()

# Step 1. Load the data, one element per line of the file.
txt <- readLines("jeju.txt")
head(txt)

# Step 3. Extract nouns only: extractNoun() is applied to every line.
nouns <- sapply(txt, extractNoun, USE.NAMES = FALSE)
class(nouns)
head(nouns, 10)

# Flatten the per-line results into individual words, replace every
# non-alphabetic character with a space, then strip all spaces.
cdata <- unlist(nouns) %>%
  stringr::str_replace_all("[^[:alpha:]]", " ")
cdata <- gsub(" ", "", cdata)

# Remove unwanted words from the flattened noun vector. Each line of the
# stop-word file is passed to gsub() as a regular-expression pattern.
gsubTxt <- readLines("제주도여행코스gsub(1).txt")
cnt <- length(gsubTxt)
# seq_along() runs zero iterations for an empty file; 1:cnt would iterate over
# c(1, 0) and the resulting NA pattern would turn every word into NA.
for (i in seq_along(gsubTxt)) {
  cdata <- gsub(gsubTxt[i], "", cdata)
}
cdata

# Keep only words that are at least two characters long.
cdata <- Filter(function(x) {
  nchar(x) >= 2
}, cdata)

# Write the cleaned words to disk and read them back as a one-column table.
write(unlist(cdata), "jeju_2.txt")
nouns <- read.table("jeju_2.txt")
nrow(nouns)

# The frequency table must be named `wordcount`: the following lines read it
# under that name, and `wordcloud` is the name of the plotting function.
wordcount <- table(nouns)
head(sort(wordcount, decreasing = TRUE), 30)
top10 <- head(sort(wordcount, decreasing = TRUE), 10)

# Pie chart of the ten most frequent words.
pie(top10,
    col = rainbow(10),
    radius = 1,
    main="제주도 추천 여행코스 TOP 10")

# step 2. Remove special characters
# txt <- stringr::str_replace_all(txt, "\\W", " ") # remove special characters; to keep only Hangul, use the uppercase W
# txt <- stringr::str_replace_all(txt,"[^[:alpha:]]"," ") 

# Step 3-1. Delete specific words.
# Each line of the stop-word file is used as a gsub() pattern.
gsubTxt <- readLines("제주도여행코스gsub(1).txt")
gsubTxt
cnt <- length(gsubTxt)
print(paste("삭제하려는 단어의 수:", cnt))
# gsub() coerces a non-character input with as.character(), which turns a
# data frame (or list) into deparsed "c(...)" strings instead of its words.
# Flatten to a plain character vector first so each word is processed.
nouns <- as.character(unlist(nouns, use.names = FALSE))
# seq_along() skips the loop entirely when the stop-word file is empty.
for (i in seq_along(gsubTxt)) {
  nouns <- gsub(gsubTxt[i], "", nouns)
}
nouns

# Step 4. Build a frequency table with one entry per distinct word.
wordcount <- table(unlist(nouns))
df_word <- as.data.frame(wordcount, stringsAsFactors = FALSE)

# Step 5. Rename the columns to `word` and `freq`.
names(df_word)
df_word <- df_word %>%
  dplyr::rename(word = Var1, freq = Freq)
df_word

# Step 6. Keep only words of two or more characters.
df_word <- df_word %>%
  dplyr::filter(nchar(word) >= 2)
df_word

# Step 7. Sort by descending frequency and keep the first 20 rows.
top_20 <- head(dplyr::arrange(df_word, desc(freq)), 20)
top_20

# Step 8. Draw the word cloud.
# Words with a frequency below 2 are skipped, at most 200 words are drawn,
# the most frequent words are placed first, and 10% of words are rotated.
with(
  df_word,
  wordcloud::wordcloud(
    words = word,
    freq = freq,
    scale = c(4, 0.3),
    min.freq = 2,
    max.words = 200,
    random.order = FALSE,
    rot.per = .1,
    colors = brewer.pal(8, "Dark2")
  )
)
LS0tDQp0aXRsZTogIuygnOyjvOuPhCB0ZXh0ICplcnJvciINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCmBgYHtyfQ0KbGlicmFyeShySmF2YSkNCmlmKCJtZW1vaXNlIiAlaW4lIGluc3RhbGxlZC5wYWNrYWdlcygibWVtb2lzZSIpID09IEZBTFNFKWluc3RhbGwucGFja2FnZXMoIm1lbW9pc2UiKQ0KbGlicmFyeShtZW1vaXNlKQ0KaWYoIktvTkxQIiAlaW4lIGluc3RhbGxlZC5wYWNrYWdlcygiS29OTFAiKSA9PSBGQUxTRSlpbnN0YWxsLnBhY2thZ2VzKCJLb05MUCIpDQpsaWJyYXJ5KEtvTkxQKQ0KaWYoInRtIiAlaW4lIGluc3RhbGxlZC5wYWNrYWdlcygidG0iKSA9PSBGQUxTRSlpbnN0YWxsLnBhY2thZ2VzKCJ0bSIpDQpsaWJyYXJ5KHRtKQ0KaWYoIndvcmRjbG91ZCIgJWluJSBpbnN0YWxsZWQucGFja2FnZXMoIndvcmRjbG91ZCIpID09IEZBTFNFKWluc3RhbGwucGFja2FnZXMoIndvcmRjbG91ZCIpDQpsaWJyYXJ5KHdvcmRjbG91ZCkNCmlmKCJkcGx5ciIgJWluJSBpbnN0YWxsZWQucGFja2FnZXMoImRwbHlyIikgPT0gRkFMU0UpaW5zdGFsbC5wYWNrYWdlcygiZHBseXIiKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoZ2dwbG90MikNCmlmKCJzdHJpbmdyIiAlaW4lIGluc3RhbGxlZC5wYWNrYWdlcygic3RyaW5nciIpID09IEZBTFNFKWluc3RhbGwucGFja2FnZXMoInN0cmluZ3IiKQ0KbGlicmFyeShzdHJpbmdyKQ0KaWYoIlJDb2xvckJyZXdlciIgJWluJSBpbnN0YWxsZWQucGFja2FnZXMoIlJDb2xvckJyZXdlciIpID09IEZBTFNFKWluc3RhbGwucGFja2FnZXMoIlJDb2xvckJyZXdlciIpDQpsaWJyYXJ5KFJDb2xvckJyZXdlcikNCg0KS29OTFA6OnVzZVNlam9uZ0RpYygpDQpnZXR3ZCgpDQojIHNldHAgMS4g642w7J207YSw66W8IOuhnOuUqe2VnOuLpA0KdHh0IDwtIHJlYWRMaW5lcygiamVqdS50eHQiKQ0KaGVhZCh0eHQpDQoNCiMgc3RlcCAzLiDrqoXsgqzrp4wg7LaU7LacDQpub3VucyA8LQ0KICBzYXBwbHkoDQogICAgdHh0LA0KICAgIGV4dHJhY3ROb3VuLA0KICAgIFVTRS5OQU1FUyA9IEYNCiAgKQ0KY2xhc3Mobm91bnMpDQpoZWFkKG5vdW5zLCAxMCkNCmNkYXRhIDwtIHVubGlzdChub3VucykgIyDqsIEg64uo7Ja066W8IOuCseqwnOuhnCDrtoTrpqwNCmNkYXRhIDwtIHN0cmluZ3I6OnN0cl9yZXBsYWNlX2FsbChjZGF0YSwiW15bOmFscGhhOl1dIiwiICIpIA0KY2RhdGEgPC0gZ3N1YigiICIsIiIsY2RhdGEpDQoNCmdzdWJUeHQgPC0gcmVhZExpbmVzKCLsoJzso7zrj4Tsl6ztlonsvZTsiqRnc3ViKDEpLnR4dCIpDQpjbnQgPC0gbGVuZ3RoKGdzdWJUeHQpDQpmb3IgKGkgaW4gMTpjbnQpIHsNCiAgY2RhdGEgPC0gZ3N1Yihnc3ViVHh0W2ldLCIiLCBjZGF0YSkNCn0NCmNkYXRhDQpjZGF0YSA8LSBGaWx0ZXIoZnVuY3Rpb24oeCl7bmNoYXIoeCkgPj0gMn0sIGNkYXRhKQ0Kd3JpdGUodW5saXN0KGNkYXRhKSwgImplanVfMi50eHQiKQ0Kbm91bnMgPC0gcmVhZC50YWJsZSgiamVqdV8yLnR4dCIpDQpu
cm93KG5vdW5zKQ0Kd29yZGNsb3VkIDwtIHRhYmxlKG5vdW5zKQ0KaGVhZChzb3J0KHdvcmRjb3VudCwgZGVjcmVhc2luZyA9IFQpLDMwKQ0KdG9wMTAgPC0gaGVhZChzb3J0KHdvcmRjb3VudCwgZGVjcmVhc2luZyA9IFQpLDEwKQ0KcGllKHRvcDEwLCANCiAgICBjb2wgPSByYWluYm93KDEwKSwNCiAgICByYWRpdXMgPSAxLA0KICAgIG1haW49IuygnOyjvOuPhCDstpTsspwg7Jes7ZaJ7L2U7IqkIFRPUCAxMCIpDQoNCiMgc3RlcCAyLiDtirnsiJjrrLjsnpDrpbwg7KCc6rGwDQojIHR4dCA8LSBzdHJpbmdyOjpzdHJfcmVwbGFjZV9hbGwodHh0LCAiXFxXIiwgIiAiKSAjIO2KueyImOusuOyekCDsoJzqsbAsIO2VnOq4gOunjCDrgqjquLDripTqsbQg64yA66y47J6QIFcNCiMgdHh0IDwtIHN0cmluZ3I6OnN0cl9yZXBsYWNlX2FsbCh0eHQsIlteWzphbHBoYTpdXSIsIiAiKSANCg0KIyBzdGVwIDMtMS4g7Yq57KCV64uo7Ja0IOyCreygnO2VmOq4sA0KZ3N1YlR4dCA8LSByZWFkTGluZXMoIuygnOyjvOuPhOyXrO2Wiey9lOyKpGdzdWIoMSkudHh0IikNCmdzdWJUeHQNCmNudCA8LSBsZW5ndGgoZ3N1YlR4dCkNCnByaW50KHBhc3RlKCLsgq3soJztlZjroKTripQg64uo7Ja07J2YIOyImDoiLCBjbnQpKQ0KZm9yIChpIGluIDE6Y250KSB7DQogIG5vdW5zIDwtIGdzdWIoZ3N1YlR4dFtpXSwiIiwgbm91bnMpDQp9DQpub3Vucw0KDQojIHN0ZXAgNC4g64uo7Ja067OE66GcIOu5iOuPhO2RnCDsnpHshLENCndvcmRjb3VudCA8LSB0YWJsZSh1bmxpc3Qobm91bnMpKQ0KZGZfd29yZCA8LSBhcy5kYXRhLmZyYW1lKHdvcmRjb3VudCwgc3RyaW5nc0FzRmFjdG9ycyA9IEYpDQoNCiMgc3RlcCA1LiDrs4DsiJjrqoUg7IiY7KCVDQpuYW1lcyhkZl93b3JkKQ0KZGZfd29yZCA8LSBkcGx5cjo6cmVuYW1lKA0KICBkZl93b3JkLA0KICB3b3JkID0gVmFyMSwNCiAgZnJlcSA9IEZyZXENCikNCmRmX3dvcmQNCg0KIyBzdGVwIDYuIDLquIDsnpAg7J207IOB66eMIOy2lOy2nA0KZGZfd29yZCA8LSBkcGx5cjo6ZmlsdGVyKGRmX3dvcmQsIG5jaGFyKHdvcmQpPj0yKQ0KZGZfd29yZA0KDQojIHN0ZXAgNy4g67mI64+E7IicIOygleugrCDtm4Qg7IOB7JyEIDIw64uo7Ja066eMIOy2lOy2nA0KdG9wXzIwIDwtIGRmX3dvcmQgJT4lIA0KICBkcGx5cjo6YXJyYW5nZShkZXNjKGZyZXEpKSAlPiUgDQogIGhlYWQoMjApDQp0b3BfMjANCg0KIyBzdGVwIDguIFdvcmQgY2xvdWQg66eM65Ok6riwDQp3b3JkY2xvdWQ6OndvcmRjbG91ZCgNCiAgd29yZHMgPSBkZl93b3JkJHdvcmQsDQogIGZyZXEgPSBkZl93b3JkJGZyZXEsDQogIG1pbi5mcmVxID0gMiwNCiAgbWF4LndvcmRzID0gMjAwLA0KICByYW5kb20ub3JkZXIgPSBGLA0KICByb3QucGVyID0gLjEsDQogIHNjYWxlID0gYyg0LCAwLjMpLA0KICBjb2xvcnMgPSBicmV3ZXIucGFsKDgsICJEYXJrMiIpDQopDQoNCmBgYA0K