套件

Row

Packages

Selenium

  • RSelenium

Data processing

  • tidyverse

Parallel computing

  • foreach
  • doParallel

Selenium

selenium-3.4.0 下載

將下載的檔案放到與 cmd 預設路徑相同的位置,方便我們快速啟動

下圖範例為 C:\Users\User

ChromeDriver

ChromeDriver 下載 (win32)

ChromeDriver 下載 (mac64)

下載完後將檔案解壓縮到 Chrome.exe 所在的資料夾,以下為範例路徑

C:\Program Files (x86)\Google\Chrome\Application

  • 注意!為避免不必要的錯誤,請先將 Chrome 更新至最新版本

再到環境變數的 PATH 增加 C:\Program Files (x86)\Google\Chrome\Application;


若無 PATH 請自行新增

Mac教學

啟動 Selenium

Column

開啟 cmd 輸入

java -jar selenium-server-standalone-3.4.0.jar

出現 Selenium Server is up and running 代表成功了

常用函數

Column

常用函數

選擇瀏覽器(以 chrome 為例)

remDr <- remoteDriver(browserName = "chrome")

開啟瀏覽器

remDr$open()

轉至所輸入的網址

remDr$navigate('https://tw.yahoo.com/')

抓特定位置

remDr$findElement()

抓多個位置

remDr$findElements()

顯示目前的網址

remDr$getCurrentUrl()

上一頁

remDr$goBack()
函數名稱非常直觀

函數名稱非常直觀

注意事項

  • 不要手動控制網頁,例如自己按重新整理或上一頁,所有動作必須由函數執行,否則會產生 Error
  • 即使是由函數控制返回上一頁,只要曾經離開該頁面,先前在該頁面取得的元素(例如 findElement() 的結果)就會失效,必須重新執行一次相關函數

以下為錯誤示範

# Counter-example: reusing an element handle after leaving the page.
remDr$navigate(url = 'https://tw.yahoo.com/')

# Look up the element matching '#UHSearchBox', set its value and submit it.
test <- remDr$findElement(using = 'css',value = '#UHSearchBox')
test$setElementAttribute(attributeName = 'value',value = 'NSYSU')
test$submitElement()

# After the search completes, use goBack() to return to the previous page
remDr$goBack()

test$setElementAttribute(attributeName = 'value',value = 'NSYSU')

# Calling setElementAttribute() directly at this point raises an error;
# remDr$findElement() has to be executed again first

test <- remDr$findElement(using = 'css',value = '#UHSearchBox')
test$setElementAttribute(attributeName = 'value',value = 'NSYSU')

PTT 自動瀏覽

PTT 自動瀏覽

Column

library(RSelenium)
library(tidyverse)
library(foreach)
library(doParallel)

#
#' Auto-browse a range of PTT board index pages in a Chrome window.
#'
#' Opens a Selenium-driven Chrome session. For every index page from
#' `pagefrom` to `pageto` it tries to open each of the first 20 list entries
#' and moves the mouse over the links found in the article body.
#'
#' @param board PTT board name inserted into the URL (e.g. "pet"). Required:
#'   the former `board = board` default was self-referential and could never
#'   be evaluated.
#' @param pagefrom,pageto First and last index page numbers to visit.
#' @param speed Seconds to sleep between browser actions. Required, for the
#'   same reason as `board`.
#' @param p Window slot: 1 positions the window at x = -5, 2 at x = 680.
#' @return `NULL`, invisibly; the function is called for its side effects.
Crawl <- function(board, pagefrom = 2189, pageto = 2190, speed, p = 1) {
  # Attached inside the function so it also works when the function is
  # shipped to a cluster worker by %dopar%.
  library(RSelenium)
  library(tidyverse)

  # Keep the frame dump for debugging, but give the caller its handler back.
  old_opts <- options(error = dump.frames)
  on.exit(options(old_opts), add = TRUE)

  remDr <- remoteDriver(browserName = "chrome")
  remDr$open()
  # Release the browser session even when a step below fails.
  on.exit(tryCatch(remDr$close(), error = function(e) NULL), add = TRUE)

  switch(p,
    remDr$setWindowPosition(x = -5, y = -20),
    remDr$setWindowPosition(x = 680, y = -20)
  )
  remDr$setWindowSize(width = 692, height = 788)

  # Best-effort hover: an element that cannot be hovered is simply skipped.
  hover <- function(element) {
    tryCatch(
      remDr$mouseMoveToLocation(webElement = element),
      error = function(e) NULL
    )
  }

  for (i in pagefrom:pageto) {
    index_url <- paste0(
      "https://www.ptt.cc/bbs/", board, "/index", i, ".html"
    )

    for (j in seq_len(20)) {
      Sys.sleep(speed)
      # Reload the index every time: element handles from a previous visit
      # are no longer usable once the browser has left the page.
      remDr$navigate(index_url)
      Sys.sleep(speed)

      entry_xpath <- paste0(
        '//*[@id="main-container"]/div[2]/div[', j, ']/div[3]/a'
      )
      opened <- tryCatch(
        {
          entry <- remDr$findElement(using = "xpath", value = entry_xpath)
          entry$clickElement()
          TRUE
        },
        error = function(e) FALSE
      )
      if (!opened) {
        # No clickable title link in this slot: report it and move on
        # instead of scanning the index page as if it were an article.
        print('被刪了')
        next
      }

      links <- remDr$findElements(
        using = "css selector",
        value = "#main-content > a"
      )
      anchor <- remDr$findElements(
        using = "css selector",
        value = "#main-content > span:nth-child(5)"
      )

      for (k in seq_along(links)) {
        Sys.sleep(speed)
        hover(links[[k]])
        # There is no previous link to revisit for the first one.
        if (k > 1) {
          hover(links[[k - 1]])
        }
        if (length(anchor) > 0) {
          hover(anchor[[1]])
        }
      }
    }
  }

  invisible(NULL)
}


# Dual-window browsing: run two PTT crawlers side by side, one per worker.
# "PSOCK" is the socket cluster implemented by the parallel package itself;
# the legacy "SOCK" type is delegated to the snow package, which this script
# never loads.
cl <- makeCluster(2, type = "PSOCK")
registerDoParallel(cl)

tryCatch(
  foreach(i = 1:2) %dopar% {
    switch(i,
      Crawl(board = "pet", pagefrom = 1110, pageto = 1121, speed = 0.5, p = i),
      Crawl(board = "pet", pagefrom = 1122, pageto = 1133, speed = 0.5, p = i)
    )
  },
  finally = {
    # `remDr` is not created by this script; only use it to close the
    # sessions when a driver object already exists in this R session.
    if (exists("remDr")) {
      tryCatch(
        remDr$closeall(),
        error = function(e) {
          warning("closeall() failed: ", conditionMessage(e), call. = FALSE)
        }
      )
    }
    # Always shut the workers down, even if a crawler raised an error.
    stopCluster(cl)
  }
)
#

Dcard 自動瀏覽

Dcard 自動瀏覽

Column

library(RSelenium)
library(tidyverse)
library(foreach)
library(doParallel)

#
#' Auto-browse the latest posts of a Dcard forum in a Chrome window.
#'
#' Opens a Selenium-driven Chrome session on the forum's "latest" listing,
#' then opens the first `n` posts one after another, moves the mouse over
#' every gallery image found in the post, and navigates back to the listing.
#'
#' @param board Dcard forum name inserted into the URL.
#' @param n Number of list entries to open.
#' @param speed Seconds to sleep between browser actions.
#' @param p Window slot: 1 positions the window at x = -5, 2 at x = 680.
#' @return `NULL`, invisibly; the function is called for its side effects.
#'   An error is raised if the i-th post entry cannot be located.
dcard_crawl <- function(board = 'pet', n = 10, speed = 1, p = 1) {
  # Attached inside the function so it also works when the function is
  # shipped to a cluster worker by %dopar%.
  library(RSelenium)
  library(tidyverse)

  # Keep the frame dump for debugging, but give the caller its handler back.
  old_opts <- options(error = dump.frames)
  on.exit(options(old_opts), add = TRUE)

  remDr <- remoteDriver(browserName = "chrome")
  remDr$open()
  # Release the browser session even when a step below fails.
  on.exit(tryCatch(remDr$close(), error = function(e) NULL), add = TRUE)

  switch(p,
    remDr$setWindowPosition(x = -5, y = -20),
    remDr$setWindowPosition(x = 680, y = -20)
  )
  remDr$setWindowSize(width = 692, height = 788)
  remDr$navigate(paste0("https://www.dcard.tw/f/", board, "?latest=true"))

  # seq_len() keeps the loop from running when n is 0.
  for (i in seq_len(n)) {
    Sys.sleep(speed)
    post_selector <- paste0(
      "#root > div > div.App_main_38Mbt > div > div > main > div > div > ",
      "div:nth-child(4) > div:nth-child(", i, ") > div > a > article > ",
      "div.PostEntry_content_g2afg > h3"
    )
    post <- remDr$findElement(using = "css selector", value = post_selector)
    post$clickElement()

    images <- remDr$findElements(
      using = "css selector",
      value = "[class='GalleryImage_image_3lGzO']"
    )
    # Iterating the list directly does nothing for a post without images.
    for (image in images) {
      Sys.sleep(speed)
      # Best-effort hover: an image that cannot be hovered is skipped.
      tryCatch(
        remDr$mouseMoveToLocation(webElement = image),
        error = function(e) NULL
      )
    }

    Sys.sleep(speed)
    # Navigate back through the driver rather than through the element handle.
    remDr$goBack()
  }

  invisible(NULL)
}

#
# Run the PTT crawler and the Dcard crawler side by side, one per worker.
# "PSOCK" is the socket cluster implemented by the parallel package itself;
# the legacy "SOCK" type is delegated to the snow package, which this script
# never loads.
cl <- makeCluster(2, type = "PSOCK")
registerDoParallel(cl)

tryCatch(
  foreach(i = 1:2) %dopar% {
    switch(i,
      Crawl(board = "pet", pagefrom = 1110, pageto = 1121, speed = 0.5, p = i),
      dcard_crawl(board = "pet", n = 100, speed = 1, p = i)
    )
  },
  finally = {
    # `remDr` is not created by this script; only use it to close the
    # sessions when a driver object already exists in this R session.
    if (exists("remDr")) {
      tryCatch(
        remDr$closeall(),
        error = function(e) {
          warning("closeall() failed: ", conditionMessage(e), call. = FALSE)
        }
      )
    }
    # Always shut the workers down, even if a crawler raised an error.
    stopCluster(cl)
  }
)