library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.1
## -- Attaching packages -------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.2.0     √ purrr   0.3.2
## √ tibble  2.1.3     √ dplyr   0.8.3
## √ tidyr   0.8.3     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0
## Warning: package 'readr' was built under R version 3.6.1
## Warning: package 'dplyr' was built under R version 3.6.1
## Warning: package 'forcats' was built under R version 3.6.1
## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(XML)
library(tm)
## Warning: package 'tm' was built under R version 3.6.1
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(KoNLP)
## Warning: package 'KoNLP' was built under R version 3.6.1
## Checking user defined dictionary!
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.6.1
library(httr)
## Warning: package 'httr' was built under R version 3.6.1
## 
## Attaching package: 'httr'
## The following object is masked from 'package:NLP':
## 
##     content
#Sys.setenv(JAVA_HOME='C:/Program Files/Java/jre1.8.0_211')
# Strip markup tags from text.
#
# x: character vector that may contain "<...>" tags.
# Returns x with every "<...>" span (shortest match) deleted; text outside
# the tags is left untouched.
cleanTags <- function(x) {
  tag_pattern <- "<.*?>"
  gsub(pattern = tag_pattern, replacement = "", x = x)
}
# Naver Open API credentials.
# NOTE(review): the literals below are committed in source. They are kept only
# as a fallback so the script keeps running unchanged; set NAVER_CLIENT_ID and
# NAVER_CLIENT_SECRET in the environment to override them, and consider
# rotating the key and removing the literals.
client_id <- Sys.getenv("NAVER_CLIENT_ID", unset = "XNIFQpFXsUfxBYwpNJrJ")
client_secret <- Sys.getenv("NAVER_CLIENT_SECRET", unset = "kXltH2V_qd")

# Search keyword(s) and their percent-encoded UTF-8 form used in the query URL
search.word <- c("엔터테인먼트")
search.enc <- URLencode(enc2utf8(search.word))

# Empty accumulator for the fetched articles. stringsAsFactors = FALSE keeps
# the columns character, matching the rows appended later.
search.result <- data.frame(terms = character(),
                            links = character(),
                            descriptions = character(),
                            pubDate = character(),
                            stringsAsFactors = FALSE)

# News search endpoint (XML response)
api_url_xml <- "https://openapi.naver.com/v1/search/news.xml"
# Query the news search API for every encoded keyword and append the returned
# items to search.result.
# Each keyword is fetched 100 items per request, at start offsets
# 1, 101, ..., 901, sorted by "sim".
pages <- list()
for (i in seq_along(search.enc)) {
  for (idx in seq(1, 901, 100)) {
    # Use only the i-th keyword; passing the whole vector would build several
    # URLs at once when more than one keyword is configured.
    query <- paste0("?query=", search.enc[i],
                    "&display=", 100, "&start=", idx, "&sort=", "sim")
    result <- GET(paste0(api_url_xml, query),
                  add_headers("X-Naver-Client-Id" = client_id,
                              "X-Naver-Client-Secret" = client_secret))

    # Report failed requests instead of silently treating them as "no items"
    if (http_error(result)) {
      warning("Naver API request failed (HTTP ", status_code(result),
              ") at start=", idx, call. = FALSE)
      next
    }

    result.xml <- xmlParse(result)
    link <- xpathSApply(result.xml, "/rss/channel/item/originallink", xmlValue)
    description <- xpathSApply(result.xml, "/rss/channel/item/description",
                               xmlValue)
    pDate <- xpathSApply(result.xml, "/rss/channel/item/pubDate", xmlValue)

    # Pages without any item are skipped
    if (length(link) > 0) {
      pages[[length(pages) + 1L]] <- data.frame(
        terms = search.enc[i],
        links = link,
        descriptions = cleanTags(description),
        pubDate = pDate,
        stringsAsFactors = FALSE)
    }
  }
}
# Bind all pages once rather than growing the data frame inside the loop
search.result <- do.call(rbind, c(list(search.result), pages))


# Keep the first occurrence of each article link (column 2 of search.result)
search.result2 <- search.result[!duplicated(search.result[, 2]), ]

# Build a corpus from the article descriptions, then strip digits and
# punctuation and collapse repeated whitespace.
# NOTE: despite its name, tdm holds the cleaned corpus, not a
# term-document matrix.
cps <- VCorpus(VectorSource(search.result2$descriptions))
tdm <- tm_map(cps, removeNumbers)
tdm <- tm_map(tdm, removePunctuation)
tdm <- tm_map(tdm, stripWhitespace)

# Strings to blank out anywhere inside a document (the search keyword itself,
# filler words, typographic symbols and leftover source tags).
# The former entries "$은" and "$는" were dropped: "$" anchors the end of the
# text, so a pattern with a character after it can never match. Trailing
# 은/는 are handled by rm_words2 below.
rm_words <- c(search.word, "밝혔다", "에서", "에게",
              "“", "”", "‘", "’", "quot", "·", "jinphototvreportcok",
              "rainbowsportsseoulcom", "◇", "▶", "■", "▲",
              "㎏", "△", "《", "》", "【", "】", "▷")

# Particle patterns; entries ending in "$" only match at the end of the text
# they are applied to. (An earlier draft wrote the "$" before the particle,
# which never matches; a duplicated "에서" entry was also removed.)
rm_words2 <- c("에서", "을$", "를$", "이$", "가$", "의$", "에게",
               "는$", "은$", "와$", "과$", "으로", "로$")
# First cleaning pass over every document of the corpus held in tdm:
#   1. shorten "송암스포츠경기장에서" to "송암경기장",
#   2. collapse the listed bts/BTS + particle spellings to "bts",
#   3. replace every rm_words match with a single space.
# Removing rm_words2 at this stage stays disabled, as before.
tdm <- local({
  bts_forms <- paste(c("bts의","bts이","bts가","BTS을","BTS를","BTS가"),
                     collapse = "|")
  drop_pattern <- paste(rm_words, collapse = "|")

  first_pass <- function(txt) {
    txt <- gsub("송암스포츠경기장에서", "송암경기장", txt)
    txt <- gsub(bts_forms, "bts", txt)
    gsub(drop_pattern, " ", txt)
  }

  tm_map(tdm, content_transformer(first_pass))
})
# Second cleaning pass: split each document on single spaces and replace
# rm_words2 matches token by token, so that the "$"-anchored particle
# patterns apply at the end of every word, then re-join with spaces.
tdm2 <- local({
  particle_pattern <- paste(rm_words2, collapse = "|")

  strip_particles <- function(txt) {
    tokens <- strsplit(txt, split = " ")[[1]]
    paste(gsub(particle_pattern, " ", tokens), collapse = " ")
  }

  tm_map(tdm, content_transformer(strip_particles))
})
tdm2
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1000
# Continue with the fully cleaned corpus under the name tdm
tdm <- tdm2
# Build the term-document matrix from the cleaned corpus, keeping terms of
# 2 to 9 characters.
# The control option must be spelled `wordLengths`. The earlier spelling
# `wordlLengths` is not a recognised option, so the 2-9 bounds were never
# applied (the recorded "Maximal term length: 21" below shows this).
myTdm <- TermDocumentMatrix(tdm,
                            control = list(tokenize = words,
                                           wordLengths = c(2, 9)))
# NOTE: the recorded output in this section predates the wordLengths fix;
# term counts will differ when the script is re-run.
myTdm
## <<TermDocumentMatrix (terms: 2651, documents: 1000)>>
## Non-/sparse entries: 9466/2641534
## Sparsity           : 100%
## Maximal term length: 21
## Weighting          : term frequency (tf)
nTerms(myTdm)
## [1] 2651
nDocs(myTdm)
## [1] 1000
# Total frequency of each term across all documents, most frequent first
myTdm.mat <- as.matrix(myTdm)
wordFreq <- rowSums(myTdm.mat)
sort.wordFreq <- sort(wordFreq, decreasing = TRUE)
wc.naver <- data.frame(word = as.character(names(sort.wordFreq)),
                       freq = sort.wordFreq, stringsAsFactors = FALSE)

# Frequency found one fifth of the way down the sorted table; used as the
# cut-off for the cloud. Integer division makes the truncation explicit, and
# max(1L, ...) keeps the index valid when fewer than five terms exist
# (index 0 would select nothing and leave the cloud empty).
min.wf.20p <- wc.naver$freq[max(1L, nrow(wc.naver) %/% 5L)]

# Draw every term whose frequency reaches the cut-off
wordcloud2(wc.naver[wc.naver$freq >= min.wf.20p, ], shape = "star",
           minRotation = -pi / 6, maxRotation = pi / 6,
           size = .5, grid = 1,
           rotateRatio = 1,
           color = "random-dark",
           backgroundColor = "#fefefe")
# Terms associated with "아티스트", using 0.25 as the correlation limit
findAssocs(myTdm, terms = "아티스트", corlimit = 0.25)
## $아티스트
##       골든차일드         러블리즈         인피니트           네이버 
##             0.76             0.76             0.70             0.65 
##         프로젝트           새로운           소속돼         비아이bi 
##             0.62             0.50             0.47             0.41 
##           선언한         휩싸이며           유튜브   공개…로켓펀치 
##             0.41             0.41             0.35             0.29 
## 공개했다인피니트    방탄소년단bts         비아이와   사진제공빅히트 
##             0.29             0.29             0.29             0.29 
##         엄격하게 울림언터테인먼트           이슬기           제기에 
##             0.29             0.29             0.29             0.29 
##         진행하고     한국정경신문         해지한다       해지한다고 
##             0.29             0.29             0.29             0.29 
##         공개했다 
##             0.28

# Hierarchical clustering of the remaining terms after sparse terms are
# removed with a 0.95 sparsity threshold.
myTdm2 <- removeSparseTerms(x = myTdm, sparse = 0.95)
m2 <- as.matrix(myTdm2)
# scale() standardises the columns of m2; dist() measures distances
# between its rows.
distMatrix <- dist(x = scale(x = m2))
fit <- hclust(d = distMatrix)
# Plot the dendrogram and box it into 5 clusters
plot(fit)
rect.hclust(tree = fit, k = 5)