# Install the CRAN packages this script needs, but only the ones that are not
# installed yet. Reinstalling unconditionally on every run is slow, requires
# network access and can replace a package that is currently loaded.
required_pkgs <- c("rvest", "stringi", "XML", "Rcpp", "yaml", "xml2")
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(rvest)
library(stringi)
library(XML)
library(Rcpp)
library(yaml)
library(xml2)
library(rJava)
library(memoise)
library(KoNLP)
library(wordcloud)
library(dplyr)
library(stringr)
library(RColorBrewer)
# Base URL of the review list; the page number is appended for every request.
url_base <- "https://movie.naver.com/movie/point/af/list.nhn?st=mcode&sword=136315&target=after&page="
head(url_base)

# Accumulates the review text of every page that was fetched.
all.reviews <- character(0)
for (page in 1:20) {
  # paste() has no encoding argument: a named `Encoding = "euc-kr"` is treated
  # as one more string and ends up glued onto the URL ("...page=1euc-kr").
  # The encoding describes the downloaded document, so it is passed to
  # read_html() instead.
  # NOTE(review): "euc-kr" is taken from the original code's intent; confirm
  # it matches the charset the page actually declares.
  url <- paste0(url_base, page)
  htxt <- read_html(url, encoding = "euc-kr")
  # The leading "." in the selector matches elements by class, i.e. elements
  # whose class is "list_netizen". The result is not called `table` so that it
  # does not shadow base::table().
  review_table <- html_nodes(htxt, ".list_netizen")
  content <- html_nodes(review_table, ".title")
  reviews <- html_text(content)
  cat(head(reviews), sep = "\n")
  # Stop as soon as a page yields no review nodes.
  if (length(reviews) == 0) {
    break
  }
  all.reviews <- c(all.reviews, reviews)
  # Progress message, terminated with a newline so consecutive pages do not
  # run together on one console line.
  cat("검색한 페이지:", page, "\n")
}
# getwd()
# setwd('C:\\Users\\Administrator\\rlang_weekend2\\rvest')
# Persist the raw reviews as plain text for the noun-extraction step that
# reads the file back with readLines(). write.table() is avoided because it
# adds a header row, row numbers and quotes, all of which readLines() would
# return as if they were review text.
writeLines(as.character(all.reviews), "review.txt")
# Frequency of each distinct review text. write.table() returns NULL, so
# assigning its result and tabulating that gave table() no data to count;
# the reviews themselves are tabulated instead.
t1 <- all.reviews
t2 <- table(t1)
# The 30 most frequent review texts.
t3 <- head(sort(t2, decreasing = TRUE), 30)
t3
# Read the saved reviews back, one element per line of the file.
tgt1 <- readLines("review.txt")
tgt1
# Prepare the KoNLP dictionaries used by extractNoun().
useSejongDic()
KoNLP::buildDictionary(
  ext_dic = c("sejong", "woorimalsam")
)
# Whitespace-only lines contain no nouns, so they are skipped before the
# analyzer is called.
tgt1 <- tgt1[nzchar(trimws(tgt1))]
# Extract the nouns of every line. lapply() always returns a list, whereas
# sapply() may simplify to a matrix when every line happens to yield the same
# number of nouns, which would change the shape of the flattened result.
tgt1 <- lapply(tgt1, extractNoun, autoSpacing = TRUE)
# Flatten the per-line noun vectors into one character vector.
tgt2 <- unlist(tgt1)
tgt2
# Strip every character that is not a letter (digits, punctuation, spaces).
tgt3 <- stringr::str_replace_all(tgt2, "[^[:alpha:]]", "")
tgt3
# gsub1 <- function(){
# gsb <- c(
# ' ', '[~!@#$%&*()_+=?<>]',"\\[",
# '[ㄱ-ㅎ]','(ㅜ|ㅠ)',"\\d+"
# )
# i <- 0
# for(i in 1:length(gsb)){
# tgt3 <- gsub(gsb[i],"",tgt2)
# }
# return(tgt3)
# }
#
# gsub2 <- function(){
# tgt4 <- gsub("인피니티","",tgt3)
# tgt4 <- gsub("어벤저스","",tgt3)
# }
#
# tgt3 <- gsub1()
# tgt4 <- gsub2()
# Keep only the tokens that are at least two characters long; which() drops
# positions where the length test is NA, as Filter() does.
tgt4 <- tgt3[which(nchar(tgt3) >= 2)]
tgt4
# Flatten the result (a no-op when it is already an atomic vector).
tgt4 <- unlist(tgt4)
tgt4
# Count how often each token occurs.
tgt5 <- table(tgt4)
tgt5
# Order the counts from most to least frequent and keep the top 30.
tgt6 <- sort(tgt5, decreasing = TRUE)
tgt6 <- head(tgt6, n = 30)
tgt6
# Eight colours from the RColorBrewer "Dark2" palette.
pal <- brewer.pal(n = 8, name = "Dark2")
# Fix the RNG seed so the layout is reproducible; try other seeds to get a
# different arrangement.
set.seed(1234)
# Draw the word cloud from the token counts in tgt6.
wordcloud(
  words = names(tgt6),
  freq = tgt6,
  colors = pal,
  random.color = TRUE,
  random.order = FALSE,   # place high-frequency words in the centre
  max.words = 200,
  min.freq = 2,           # only words that occur at least twice
  rot.per = 0.25,         # proportion of rotated words
  scale = c(2.5, 0.1)     # word size ranges from 0.1 to 2.5
)
LS0tDQp0aXRsZTogIkNyYXdsaW5nIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCiANCg0KYGBge3J9DQppbnN0YWxsLnBhY2thZ2VzKCJydmVzdCIpDQppbnN0YWxsLnBhY2thZ2VzKCJzdHJpbmdpIikNCmluc3RhbGwucGFja2FnZXMoIlhNTCIpDQppbnN0YWxsLnBhY2thZ2VzKCJSY3BwIikNCmluc3RhbGwucGFja2FnZXMoInlhbWwiKQ0KaW5zdGFsbC5wYWNrYWdlcygieG1sMiIpDQpsaWJyYXJ5KHJ2ZXN0KQ0KbGlicmFyeShzdHJpbmdpKQ0KbGlicmFyeShYTUwpDQpsaWJyYXJ5KFJjcHApDQpsaWJyYXJ5KHlhbWwpDQpsaWJyYXJ5KHhtbDIpDQpsaWJyYXJ5KHJKYXZhKQ0KbGlicmFyeShtZW1vaXNlKQ0KbGlicmFyeShLb05MUCkNCmxpYnJhcnkod29yZGNsb3VkKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoc3RyaW5ncikNCmxpYnJhcnkoUkNvbG9yQnJld2VyKQ0KDQp1cmxfYmFzZSA8LSAiaHR0cHM6Ly9tb3ZpZS5uYXZlci5jb20vbW92aWUvcG9pbnQvYWYvbGlzdC5uaG4/c3Q9bWNvZGUmc3dvcmQ9MTM2MzE1JnRhcmdldD1hZnRlciZwYWdlPSINCmhlYWQodXJsX2Jhc2UpDQphbGwucmV2aWV3cyA8LSBjKCkNCmZvcihwYWdlIGluIDE6MjApew0KICB1cmwgPC0gcGFzdGUoDQogICAgdXJsX2Jhc2UsDQogICAgcGFnZSwNCiAgICBzZXA9JycsDQogICAgRW5jb2RpbmcgPSAiZXVjLWtyIg0KICApDQogIGh0eHQgPC0gcmVhZF9odG1sKHVybCkNCiAgdGFibGUgPC0gaHRtbF9ub2RlcyhodHh0LCcubGlzdF9uZXRpemVuJykgICMgLmxpc3RfbmV0aXplbuydmCAu7J2AIGNsYXNz6rCAIGxpc3RfbmV0aXplbuyduA0KICBjb250ZW50IDwtIGh0bWxfbm9kZXModGFibGUsJy50aXRsZScpDQogIHJldmlld3MgPC0gaHRtbF90ZXh0KGNvbnRlbnQpDQogIGNhdChoZWFkKHJldmlld3MpKQ0KICBpZihsZW5ndGgocmV2aWV3cyk9PTApew0KICAgIGJyZWFrOw0KICB9ICMg66as67ew7JeG7J20IO2PieygkOunjCDspIAg6rKD7J2AIOygnOyZuA0KICBhbGwucmV2aWV3cyA8LSBjKGFsbC5yZXZpZXdzLHJldmlld3MpDQogIGNhdCgi6rKA7IOJ7ZWcIO2OmOydtOyngDoiLCBwYWdlKQ0KfQ0KIyBnZXR3ZCgpDQojIHNldHdkKCdDOlxcVXNlcnNcXEFkbWluaXN0cmF0b3JcXHJsYW5nX3dlZWtlbmQyXFxydmVzdCcpDQp0MSA8LSB3cml0ZS50YWJsZShhbGwucmV2aWV3cywncmV2aWV3LnR4dCcpDQp0MiA8LSB0YWJsZSh0MSkNCnQzIDwtIGhlYWQoc29ydCh0MixkZWNyZWFzaW5nID0gVCksMzApDQp0Mw0KDQp0Z3QxIDwtIHJlYWRMaW5lcygicmV2aWV3LnR4dCIpDQp0Z3QxDQp1c2VTZWpvbmdEaWMoKQ0KS29OTFA6OmJ1aWxkRGljdGlvbmFyeSgNCiAgZXh0X2RpYyA9IGMoJ3Nlam9uZycsJ3dvb3JpbWFsc2FtJykNCikNCg0KdGd0MSA8LSBzYXBwbHkodGd0MSwNCiAgICAgICAgICAgICAgIGV4dHJhY3ROb3VuLA0KICAgICAgICAgICAgICAgVVNFLk5BTUVTID0gRiwNCiAgICAgICAgICAgICAgIGF1dG9TcGFjaW5nPVQpDQoNCnRndDIg
PC0gdW5saXN0KHRndDEpDQp0Z3QyDQp0Z3QzIDwtIHN0cmluZ3I6OnN0cl9yZXBsYWNlX2FsbCh0Z3QyLCdbXls6YWxwaGE6XV0nLCcnKQ0KdGd0Mw0KDQojIGdzdWIxIDwtIGZ1bmN0aW9uKCl7DQojICAgZ3NiIDwtIGMoDQojICAgICAnICcsICdbfiFAIyQlJiooKV8rPT88Pl0nLCJcXFsiLA0KIyAgICAgJ1vjhLEt44WOXScsJyjjhZx844WgKScsIlxcZCsiDQojICAgKQ0KIyAgIGkgPC0gMA0KIyAgIGZvcihpIGluIDE6bGVuZ3RoKGdzYikpew0KIyAgICAgdGd0MyA8LSBnc3ViKGdzYltpXSwiIix0Z3QyKQ0KIyAgIH0NCiMgICByZXR1cm4odGd0MykNCiMgfQ0KIyANCiMgZ3N1YjIgPC0gZnVuY3Rpb24oKXsNCiMgICB0Z3Q0IDwtICBnc3ViKCLsnbjtlLzri4jti7AiLCIiLHRndDMpDQojICAgdGd0NCA8LSAgZ3N1Yigi7Ja067Kk7KCA7IqkIiwiIix0Z3QzKQ0KIyB9DQojIA0KIyB0Z3QzIDwtIGdzdWIxKCkNCiMgdGd0NCA8LSBnc3ViMigpDQoNCnRndDQgPC0gRmlsdGVyKGZ1bmN0aW9uKHgpe25jaGFyKHgpPj0yfSx0Z3QzKQ0KdGd0NA0KdGd0NCA8LSB1bmxpc3QodGd0NCkNCnRndDQNCnRndDUgPC0gdGFibGUodGd0NCkNCnRndDUNCnRndDYgPC0gaGVhZChzb3J0KHRndDUsZGVjcmVhc2luZyA9IFQpLDMwKQ0KdGd0Ng0KDQpwYWwgPC0gYnJld2VyLnBhbCg4LCJEYXJrMiIpDQpzZXQuc2VlZCgxMjM0KSAjIOuqqOyWkeydtCDsnbzsuZgg7JWI7ZWY66m0IOyIq+yekOulvCDrsJTqv5TqsIDrqbAg66qo7JaRIOywvuq4sA0Kd29yZGNsb3VkKA0KICBuYW1lcyh0Z3Q2KSwNCiAgZnJlcSA9IHRndDYsDQogIHNjYWxlID0gYygyLjUsMC4xKSwgIyDri6jslrTtgazquLAgMC4xIH4gMi41DQogIHJvdC5wZXIgPTAuMjUsICMg7ZqM7KCE67mE7JyoDQogIG1pbi5mcmVxID0gMiwgIyDstZzsoIAg67mI64+E7IiYIDLtmozsnbTsg4ENCiAgbWF4LndvcmRzID0gMjAwLA0KICByYW5kb20ub3JkZXIgPSBGLCAjIOqzoOu5iOuPhCDri6jslrQg7KSR7JWZ67Cw7LmYDQogIHJhbmRvbS5jb2xvciA9IFQsDQogIGNvbG9ycyA9IHBhbA0KKQ0KDQpgYGANCg0KDQo=