R Crawler

此爬網內容僅供參考與教學,請小心使用。任何爬網行為都屬於駭客行為,請注意自己的程式碼,並適時在程式當中使用 Sys.sleep() 函數減輕 Server 負擔。保持社交距離,傑克關心您的健康。

RSelenium需使用的額外driver:瀏覽器 driver(http://chromedriver.chromium.org/)與 Selenium driver(https://www.selenium.dev/downloads/)。

一般網頁

利用Selectorgadget選擇網頁中所想要抓取的內容,抓取刺激1995(電影經典中的經典)評分,由下可以看到不管是利用CSS或是xpath的方式所找到的內容是一致的。

## IMDb
library(rvest)
library(httr)
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.6.3

# Parse the title page once; every query below reuses the parsed document.
url <- 'https://www.imdb.com/title/tt0111161'
top_movie <- read_html(url)

# Rating located with a CSS selector.
ranking <- as.numeric(html_text(html_nodes(top_movie, css = 'strong span')))
ranking
## [1] 9.3

# The same rating located with an XPath expression.
ranking2 <- as.numeric(html_text(html_nodes(top_movie, xpath = '//strong /span')))
ranking2
## [1] 9.3

# Raw cast names: each entry still carries a leading space and a trailing
# newline, as the recorded output below shows.
cast <- html_text(html_nodes(top_movie, css = '.primary_photo+ td a'))
cast
##  [1] " Tim Robbins\n"       " Morgan Freeman\n"    " Bob Gunton\n"       
##  [4] " William Sadler\n"    " Clancy Brown\n"      " Gil Bellows\n"      
##  [7] " Mark Rolston\n"      " James Whitmore\n"    " Jeffrey DeMunn\n"   
## [10] " Larry Brandenburg\n" " Neil Giuntoli\n"     " Brian Libby\n"      
## [13] " David Proval\n"      " Joseph Ragno\n"      " Jude Ciccolella\n"

# Remove the leading space and every newline.
cast <- gsub(pattern = '^ |\n', replacement = "", x = cast)
cast
##  [1] "Tim Robbins"       "Morgan Freeman"    "Bob Gunton"       
##  [4] "William Sadler"    "Clancy Brown"      "Gil Bellows"      
##  [7] "Mark Rolston"      "James Whitmore"    "Jeffrey DeMunn"   
## [10] "Larry Brandenburg" "Neil Giuntoli"     "Brian Libby"      
## [13] "David Proval"      "Joseph Ragno"      "Jude Ciccolella"

熱情女孩dashboard

## ps sister
library(rvest)
library(httr)

url <- "https://www.brothers.tw/ps_teammatelist.php"
ps <- read_html(url)

# `src` attribute of every <img> matched by the selector; html_attr() yields NA
# for a node that has no `src`.
rating <- ps %>%
  html_nodes(css = "#content_all div td img") %>%
  html_attr("src")

# Build absolute image URLs. Missing/empty `src` values are dropped first, and
# the empty case is handled explicitly because paste0() on a zero-length vector
# would otherwise return the bare base URL.
img_src <- rating[!is.na(rating) & nzchar(rating)]
mylove <- if (length(img_src) > 0) {
  paste0('https://www.brothers.tw/', img_src)
} else {
  character(0)
}

# Download each image. download_html() writes to a file named after the URL's
# basename by default. One failed download is reported and skipped instead of
# aborting the remaining ones.
for (img_url in mylove) {
  tryCatch(
    download_html(img_url),
    error = function(e) {
      message("Download failed for ", img_url, ": ", conditionMessage(e))
    }
  )
  Sys.sleep(0.5) # be gentle with the server
}

# Display one of the downloaded images.
imager::load.image("oy5fu8jh20190423150445.jpg") %>% plot

需傳送資訊的網頁爬網

網頁有些時候需要傳送一些特定資訊才能讓使用者進入網站。利用EditThisCookie這個擴充套件觀察進入ptt表特版時的cookie內容,可知當over18 = 1時,即可進入頁面。

## PTT Beauty board: article list
library(rvest)
library(httr)

# Fetch `page_url` with the over18 cookie set and return the `href` attribute of
# every node matching the CSS selector `css`.
# A page that cannot be fetched or parsed produces a warning and an empty
# character vector, so one bad page does not abort the whole crawl.
fetch_hrefs <- function(page_url, css) {
  tryCatch(
    GET(page_url, set_cookies(over18 = 1)) %>%
      read_html() %>%
      html_nodes(css = css) %>%
      html_attr('href'),
    error = function(e) {
      warning("Failed to read ", page_url, ": ", conditionMessage(e),
              call. = FALSE)
      character(0)
    }
  )
}

# Article links on the current index page.
url <- fetch_hrefs("https://www.ptt.cc/bbs/Beauty/index.html", ".title a")

## Index pages differ only by the page number in the URL
all_url_page <- paste0('https://www.ptt.cc/bbs/Beauty/index', 3266:3267, '.html') # adjust the page range as needed

## Collect the article links from every index page
all_url_data <- unlist(lapply(all_url_page, fetch_hrefs, css = ".title a"),
                       use.names = FALSE)
all_url_data <- all_url_data[!is.na(all_url_data)]
# Strip a leading "/" before prefixing so the result never contains "//bbs";
# handle the empty case explicitly because paste0() on a zero-length vector
# would return the bare base URL.
all_url_data <- if (length(all_url_data) > 0) {
  paste0('https://www.ptt.cc/', sub('^/', '', all_url_data))
} else {
  character(0)
}

## Collect every link found in the body of each article
link_list <- vector("list", length(all_url_data))
for (i in seq_along(all_url_data)) {
  link_list[[i]] <- fetch_hrefs(all_url_data[i], '#main-content a')
  if (i %% 30 == 0) Sys.sleep(0.5) # pause every 30 requests to reduce server load
}
final_data <- unlist(link_list, use.names = FALSE)

## Keep image links only
# Accept both http and https (the previous pattern 'http(s)' required "https").
temp <- final_data[grepl('^https?://', final_data)]
# A single pattern for both extensions; unique() avoids fetching a link twice.
final_data2 <- unique(temp[grepl('\\.(jpg|png)', temp)])

## Download the images. A warning or error on one link is reported and the loop
## moves on to the next link; successful downloads stay silent.
for (img_url in final_data2) {
  tryCatch(
    download_html(img_url),
    warning = function(w) {
      message("Download warning for ", img_url, ": ", conditionMessage(w))
    },
    error = function(e) {
      message("Download failed for ", img_url, ": ", conditionMessage(e))
    }
  )
}

# Display one of the downloaded images.
imager::load.image("AF0KVvd.jpg") %>% plot

動態網頁範例

現在許多網頁都為動態網頁,利用Selenium Driver在R裡面的應用,模擬真實用戶觀看網頁,並抓取想要的內容。

#devtools::install_github("ropensci/RSelenium")

# Start the Selenium server first: run the following command in a terminal
# (cmd) opened as administrator.
#java -Dwebdriver.chrome.driver=G://Rcrawler//chromedriver.exe -jar G://Rcrawler//selenium-server-standalone-3.8.0.jar
library(RSelenium)

# Connection settings for the running Selenium server
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444,
  browserName = "chrome")

# Open a browser session
remDr$open()
# Set the window size
remDr$setWindowSize(width = 800, height = 800)
# Go to the news list page
remDr$navigate("https://www.ettoday.net/news/news-list.htm")
webElem <- remDr$findElement("css", "body")

# Simulate scrolling; the page's content changes can usually be observed among
# the XHR requests in the browser's developer tools.
for (i in 1:10) { # number of scroll rounds
  webElem$sendKeysToElement(list(key = "end"))
  webElem$sendKeysToElement(list(key = "down_arrow"))
  Sys.sleep(0.5) # presumably gives newly requested items time to load -- tune as needed
}

# After scrolling, read the whole rendered page in one go
html <- read_html(remDr$getPageSource()[[1]][1])
url <- html %>% html_nodes(css = ".part_list_2 a") %>% html_attr("href")
url <- url[!is.na(url)]
# Prefix only hrefs that are not already absolute; an empty `url` stays empty.
is_absolute <- grepl('^https?://', url)
url[!is_absolute] <- paste0('https://www.ettoday.net', url[!is_absolute])
#remDr$close() # close the browser session once the page source has been read

# Text of the first node matching `css` in `page`, or NA when nothing matches.
first_text <- function(page, css) {
  txt <- page %>% html_nodes(css = css) %>% html_text()
  if (length(txt) > 0) txt[1] else NA_character_
}

# Parse every article and store the result: one row per URL, in the same order
# as `url`. Each page is downloaded once; a page that fails to load yields an
# NA row and a warning instead of stopping the loop.
rows <- vector("list", length(url))
for (i in seq_along(url)) {
  rows[[i]] <- tryCatch({
    page <- read_html(url[i])
    data.frame(title = first_text(page, ".title"),
               content = first_text(page, ".story"),
               stringsAsFactors = FALSE)
  }, error = function(e) {
    warning("Failed to read ", url[i], ": ", conditionMessage(e), call. = FALSE)
    data.frame(title = NA_character_, content = NA_character_,
               stringsAsFactors = FALSE)
  })
  Sys.sleep(0.2) # be gentle with the server
}

# Bind once at the end rather than growing the data frame inside the loop.
my_data <- if (length(rows) > 0) {
  do.call(rbind, rows)
} else {
  data.frame(title = character(0), content = character(0),
             stringsAsFactors = FALSE)
}

#head(my_data)
#data.table::fwrite(my_data,'G://Rcrawler//newlist.csv') # write the result to disk