배경

rstudio/chromote 패키지를 활용하여 웹페이지의 일부 데이터를 가져오는 연습입니다.

#remotes::install_github("rstudio/chromote")
#install.packages('dplyr')
#install.packages('utf8')
library(chromote,quietly=TRUE)
library(dplyr,quietly=TRUE)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(utf8,quietly=TRUE)

연습코드

Views, Likes

# Replay page to inspect, opened in a fresh Chromote session.
link <- "https://view.shoppinglive.naver.com/replays/74801"
b <- ChromoteSession$new() # new session

# Start navigation without blocking, wait for the load event, then pause
# another 5 seconds before touching the DOM.
invisible({
  b$Page$navigate(link, wait_ = FALSE)
  b$Page$loadEventFired()
  Sys.sleep(5)
})

# Read the <span> under the first link of the TagItemLayout wrapper.
# Retry once per second (at most 5 attempts) while the evaluation yields
# no value, printing a dot for every failed attempt.
trial <- 5
repeat {
  if (trial <= 0) break
  output <- b$Runtime$evaluate('document.querySelector("#root > div > div > div > div > div > div > div > div.TagItemLayout_wrap_1tXSl > a:nth-child(1) > span").innerHTML')
  if (!is.null(output$result$value)) break
  cat(".")
  trial <- trial - 1
  Sys.sleep(1)
}

print(output$result$value)
## [1] "48,000"
b$close()
## [1] TRUE
rm(trial, link, output, b)

Comment

코멘트는 최대 300개까지 업데이트된다. 재생 중 실시간으로 올라오는 코멘트만 볼 수 있다.

library(chromote,quietly = TRUE)
# Replay page to inspect, opened in a fresh Chromote session.
link <- "https://view.shoppinglive.naver.com/replays/74801"
b <- ChromoteSession$new() # new session
invisible({
  b$Page$navigate(link, wait_ = FALSE)
  b$Page$loadEventFired()
  Sys.sleep(5)
})
# Let the replay run for two minutes before reading the comment list.
cat("Let's wait for 2 minutes...")
## Let's wait for 2 minutes...
Sys.sleep(60*2)
cat("\nOK.")
## 
## OK.
# Count the comment <span> elements currently present in the page.
code <- "#root > div > div > div > div > div > div > div > div.Comment_wrap_3ylpz > div > div:nth-child(1) > div > span"
codeEval <- sprintf('document.querySelectorAll("%s").length', code)
output <- b$Runtime$evaluate(codeEval)
output <- output$result$value
output
## [1] 11
# seq_len() gives an empty sequence when there are no comments, whereas
# 1:output would produce c(1, 0) and query children that do not exist.
items <- seq_len(output)
codeEvala <- sprintf('document.querySelector("#root > div > div > div > div > div > div > div > div.Comment_wrap_3ylpz > div > div:nth-child(1) > div:nth-child(%d) > span").innerHTML', items)
# Evaluate one JS expression and return its value as a formatted string.
# A missing value becomes NA instead of the literal text "NULL".
fun <- function(x) {
  value <- b$Runtime$evaluate(x)$result$value
  if (is.null(value)) {
    return(NA_character_)
  }
  utf8::utf8_format(value)
}
# vapply() guarantees a character vector even when there is nothing to read.
x <- vapply(codeEvala, fun, character(1), USE.NAMES = FALSE)
# Show at most the first 10 comments without padding the table with NA rows.
knitr::kable(data.frame(no = seq_len(min(10, length(x))), comment = head(x, 10)))
no comment
1 🖐🖐🖐🖐
2 grn과 함께하는 34,900원 분홍이 초록이 세트 다이어트 시작합니다!
3 이제 봄 오니까 필수죠
4 hello
5 봄 왔어요
6 방가
7 안녕하세용
8 혜택 기대돼요
9 혜택 기대돼요
10 안녕하세요~
# Close the browser session, then drop every temporary created above.
b$close()
## [1] TRUE
rm(list = c("x", "fun", "codeEvala", "items", "output", "codeEval", "code", "b"))

알고리즘

# Start a new Chromote session.
# Returns: the ChromoteSession object created by ChromoteSession$new().
createBrowser <- function() {
  ChromoteSession$new()
}
# Navigate the session to a URL and wait for the page to settle.
#   b          : browser session exposing Page$navigate / Page$loadEventFired
#   link       : URL to open
#   waitForSec : extra seconds to sleep after the load event
# Returns NULL invisibly.
visitLink <- function(b, link, waitForSec = 5) {
  # Fire the navigation without blocking, then wait for the load event.
  b$Page$navigate(link, wait_ = FALSE)
  b$Page$loadEventFired()
  Sys.sleep(waitForSec)
  invisible(NULL)
}
# Read the `duration` property of the page's <video> element.
#   b : browser session exposing Runtime$evaluate
# Returns the evaluated value coerced with as.numeric() (numeric(0) when the
# evaluation carries no value).
getVideoLength <- function(b) {
  video_selector <- "#root > div > div > div > div > div > div > div > div.NativePlayer_wrap_2FTTe > video"
  js <- paste0('document.querySelector("', video_selector, '").duration')
  evaluated <- b$Runtime$evaluate(js)
  as.numeric(evaluated$result$value)
}
# Read the `currentTime` property of the page's <video> element.
#   b : browser session exposing Runtime$evaluate
# Returns the evaluated value coerced with as.numeric() (numeric(0) when the
# evaluation carries no value).
getCurrentVideoTime <- function(b) {
  video_selector <- "#root > div > div > div > div > div > div > div > div.NativePlayer_wrap_2FTTe > video"
  js <- paste0('document.querySelector("', video_selector, '").currentTime')
  evaluated <- b$Runtime$evaluate(js)
  as.numeric(evaluated$result$value)
}
# Read the innerHTML of the first element matching a CSS selector, retrying
# while the evaluation yields no value.
#   b        : browser session exposing Runtime$evaluate
#   css_code : CSS selector, inserted verbatim into a double-quoted JS string
#   trial    : maximum number of attempts; values <= 0 perform no attempt
#   verbose  : when TRUE, print a dot per failed attempt and a final newline
# Returns the innerHTML value, or NULL when no attempt produced one.
extractData <- function(b, css_code, trial = 5, verbose = TRUE) {
  # The expression does not change between attempts, so build it once.
  selectorCode <- sprintf('document.querySelector("%s").innerHTML', css_code)
  # Initialised up front so that trial <= 0 returns NULL instead of failing
  # with "object 'output' not found".
  value <- NULL
  while (trial > 0) {
    value <- b$Runtime$evaluate(selectorCode)$result$value
    if (!is.null(value)) {
      break
    }
    if (verbose) {
      cat(".")
    }
    trial <- trial - 1
    # Only wait when another attempt will follow.
    if (trial > 0) {
      Sys.sleep(1)
    }
  }
  if (verbose) {
    cat("\n")
  }
  value
}
# Poll the comment list of the open page about once per second and collect a
# snapshot of every visible (user, comment) pair on each tick.
#   b       : browser session exposing Runtime$evaluate
#   waitFor : maximum number of one-second ticks to keep polling
# Polling stops early once the video's current time reaches its duration.
# Returns a list with one data.frame(user, comment) per tick on which at
# least one comment was present.
getComments <- function(b, waitFor = 60 * 2) {
  # Evaluate one JS expression and return its value as a formatted string.
  # A missing value (e.g. the element vanished between the count and the
  # read) becomes NA instead of the literal text "NULL".
  read_text <- function(js) {
    value <- b$Runtime$evaluate(js)$result$value
    if (is.null(value)) {
      return(NA_character_)
    }
    utf8::utf8_format(value)
  }

  # Loop-invariant JS snippets: element count, and per-row text / user name.
  count_js <- 'document.querySelectorAll("#root > div > div > div > div > div > div > div > div.Comment_wrap_3ylpz > div > div:nth-child(1) > div > span").length'
  comment_js <- 'document.querySelector("#root > div > div > div > div > div > div > div > div.Comment_wrap_3ylpz > div > div:nth-child(1) > div:nth-child(%d) > span").innerHTML'
  user_js <- 'document.querySelector("#root > div > div > div > div > div > div > div > div.Comment_wrap_3ylpz > div > div:nth-child(1) > div:nth-child(%d) > strong").innerHTML'

  result <- list()
  timeTick <- 0
  video_length <- getVideoLength(b)
  while (timeTick <= waitFor) {
    n_items <- as.numeric(b$Runtime$evaluate(count_js)$result$value)
    # isTRUE() guards against a zero-length or NA count, which would make a
    # bare if() fail.
    if (isTRUE(n_items > 0)) {
      items <- seq_len(n_items)
      comment <- vapply(sprintf(comment_js, items), read_text, character(1),
                        USE.NAMES = FALSE)
      user <- vapply(sprintf(user_js, items), read_text, character(1),
                     USE.NAMES = FALSE)
      result[[length(result) + 1L]] <- data.frame(user, comment)
    }
    # Stop once playback has reached the end. The comparison is skipped when
    # either time could not be read (zero-length or NA value).
    current_time <- getCurrentVideoTime(b)
    if (isTRUE(current_time >= video_length)) {
      break
    }
    timeTick <- timeTick + 1
    Sys.sleep(1)
  }
  result
}

# Close the browser session.
#   b : session object exposing close()
# Returns whatever b$close() returns, invisibly.
exitBrowser <- function(b) {
  closed <- b$close()
  invisible(closed)
}

View와 Like 수 가져오기

library(chromote)
# Selectors for the <span> under the first and second link of the
# TagItemLayout wrapper; they are reported below as views and likes.
code1 <- "#root > div > div > div > div > div > div > div > div.TagItemLayout_wrap_1tXSl > a:nth-child(1) > span"
code2 <- "#root > div > div > div > div > div > div > div > div.TagItemLayout_wrap_1tXSl > a:nth-child(2) > span"
link <- "https://view.shoppinglive.naver.com/replays/74801"

# Open the page, read both values, then report them.
b <- createBrowser()
visitLink(b, link)
result1 <- extractData(b, code1)
result2 <- extractData(b, code2)
cat("-- RESULT --\n")
## -- RESULT --
cat(" N of View: ", result1, "\n")
##  N of View:  48,000
cat(" N of Like: ", result2, "\n")
##  N of Like:  28,800

# Close the session and remove the temporaries (result1/result2 are kept).
exitBrowser(b)
rm(code1, code2, link, b)

코멘트 가져오기

# Open the replay page and poll its comment list for up to five minutes.
link <- "https://view.shoppinglive.naver.com/replays/74801"
b <- createBrowser()
visitLink(b, link)
comments <- getComments(b, 60 * 5) # Naver Shopping Live ONLY! (5min)

# Stack the per-tick snapshots, drop repeated rows, then count rows per
# (user, comment) pair.
# NOTE(review): distinct() runs before the grouped count and the snapshots
# only hold `user` and `comment`, so numOfDup is 1 for every row -- confirm
# whether the count was meant to be taken before de-duplicating.
comments <- distinct(bind_rows(comments))
comments <- summarize(group_by(comments, user, comment), numOfDup = n())
comments <- ungroup(comments)
## `summarise()` regrouping output by 'user' (override with `.groups` argument)
knitr::kable(head(comments, 20))
user comment numOfDup
3gd9 ㄷ자이어트다이어트 1
3gd9 다이어트 1
3gd9 분홍이 1
3gd9 새해목표아닌가연ㅇ 1
3gd9 이번에꼭!! 1
3gd9 초록이 1
3gd9 해야돼요 1
3gd9 1
Be 안녕하세요 1
cake**** 식사 전후 먹어야 하는 거네,, 1
Da y 좋네요 1
Elisabeth 저렴하네요 오늘 1
FLY 믿을수있네요 1
foxs**** 괜찮네요 1
gre 다이어트식품이군요 1
gre 자주 보던 상품이네요 1
imuuu 좋아요 1
isna**** 오늘도 화이팅 하세요 ㅎㅎ 1
jang**** 반가워요 1
jang**** 색상이 더 이뻐요 1
# Close the session and remove the objects created for this section.
exitBrowser(b)
rm(list = c("comments", "b", "link"))