此爬網內容僅供參考與教學，請小心使用。任何爬網行為都屬於駭客行為，請注意自己的程式碼，並適時在程式當中使用 Sys.sleep() 函數減輕 Server 負擔。保持社交距離，傑克關心您的健康。
RSelenium 需使用的額外 driver：瀏覽器 driver（http://chromedriver.chromium.org/）與 Selenium driver（https://www.selenium.dev/downloads/）。
利用 SelectorGadget 選擇網頁中想要抓取的內容，抓取《刺激1995》（電影經典中的經典）的評分。由下可以看到，不管是利用 CSS 或是 XPath 的方式，所找到的內容是一致的。
## IMDb: scrape the rating and the cast list of one title page.
library(rvest)
library(httr)
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.6.3
url <- "https://www.imdb.com/title/tt0111161"
top_movie <- read_html(url)

# Rating located with a CSS selector.
ranking <- as.numeric(html_text(html_nodes(top_movie, css = "strong span")))
ranking
## [1] 9.3

# Same rating located with an XPath expression; the echoed output shows that
# both selectors return the same value.
ranking2 <- as.numeric(html_text(html_nodes(top_movie, xpath = "//strong /span")))
ranking2
## [1] 9.3

# Cast names: the link in the table cell that follows each photo cell.
cast <- html_text(html_nodes(top_movie, css = ".primary_photo+ td a"))
cast
## [1] " Tim Robbins\n" " Morgan Freeman\n" " Bob Gunton\n"
## [4] " William Sadler\n" " Clancy Brown\n" " Gil Bellows\n"
## [7] " Mark Rolston\n" " James Whitmore\n" " Jeffrey DeMunn\n"
## [10] " Larry Brandenburg\n" " Neil Giuntoli\n" " Brian Libby\n"
## [13] " David Proval\n" " Joseph Ragno\n" " Jude Ciccolella\n"

# Strip the single leading space and every newline from the raw text.
cast <- gsub("^ |\n", "", cast)
cast
## [1] "Tim Robbins" "Morgan Freeman" "Bob Gunton"
## [4] "William Sadler" "Clancy Brown" "Gil Bellows"
## [7] "Mark Rolston" "James Whitmore" "Jeffrey DeMunn"
## [10] "Larry Brandenburg" "Neil Giuntoli" "Brian Libby"
## [13] "David Proval" "Joseph Ragno" "Jude Ciccolella"
熱情女孩dashboard
## ps sister: collect every member photo on the team list page and download it.
library(rvest)
library(httr)
url <- "https://www.brothers.tw/ps_teammatelist.php"
ps <- read_html(url)

# `src` attribute of every image inside the content table.
rating <- ps %>%
  html_nodes(css = "#content_all div td img") %>%
  html_attr("src")

# Build absolute URLs. <img> tags without a `src` are skipped, and an empty
# result stays empty: paste0() on a zero-length vector would otherwise
# fabricate the bare site URL as a single bogus entry.
img_src <- rating[!is.na(rating)]
mylove <- if (length(img_src) > 0) {
  paste0("https://www.brothers.tw/", img_src)
} else {
  character(0)
}

# download_html() saves each file under basename(url) (xml2 default), i.e.
# into the current working directory.
for (i in seq_along(mylove)) {
  download_html(mylove[i])
  # Short pause every 30 requests to keep the load on the server low.
  if (i %% 30 == 0) Sys.sleep(0.5)
}

# Preview one image; the file name is a fixed example and is presumably one of
# the files fetched above -- adjust it if the site content has changed.
imager::load.image("oy5fu8jh20190423150445.jpg") %>% plot
網頁有些時候需要傳送一些特定資訊才能讓使用者進入網站。利用 EditThisCookie 這個擴充套件觀察進入 PTT 表特板時的 cookie 內容，可以得知當 over18 = 1 時，即可進入頁面。
## PTT article list
library(rvest)
library(httr)
# Request the board index with the cookie over18 = 1 attached, then pull the
# href of every article title link on that page.
url <- GET(
  "https://www.ptt.cc/bbs/Beauty/index.html",
  set_cookies(over18 = 1)
) %>%
  read_html() %>%
  html_nodes(css = ".title a") %>%
  html_attr("href")
## The index page number is part of the URL; adjust the range to the pages
## you want to crawl.
all_url_page <- paste0("https://www.ptt.cc/bbs/Beauty/index", 3266:3267, ".html")

## Collect the article links listed on each index page.
# Results are gathered per page and flattened once instead of growing a vector
# with c() on every iteration.
all_url_data <- unlist(lapply(all_url_page, function(page_url) {
  GET(page_url, set_cookies(over18 = 1)) %>%
    read_html() %>%
    html_nodes(css = ".title a") %>%
    html_attr("href")
}))

# Turn the hrefs into absolute URLs. A leading "/" is stripped first so the
# result never contains "//" after the host, and an empty result stays empty
# (paste0() on a zero-length vector would return the bare site URL).
all_url_data <- all_url_data[!is.na(all_url_data)]
all_url_data <- if (length(all_url_data) > 0) {
  paste0("https://www.ptt.cc/", sub("^/", "", all_url_data))
} else {
  character(0)
}

## Collect every link found in the main content of each article.
final_data <- unlist(lapply(seq_along(all_url_data), function(i) {
  links <- GET(all_url_data[i], set_cookies(over18 = 1)) %>%
    read_html() %>%
    html_nodes("#main-content a") %>%
    html_attr("href")
  # Pause briefly after every 30th request to reduce the load on the server.
  if (i %% 30 == 0) Sys.sleep(0.5)
  links
}))
## Keep only links that point at images.
# Accept both http:// and https:// links: the previous pattern 'http(s)'
# required the literal text "https", so plain http links were dropped.
# grepl() yields FALSE for NA, so missing hrefs are removed here as well.
temp <- final_data[grepl("^https?://", final_data)]
# One pass over both extensions keeps the links in page order and avoids
# listing a URL twice when it matches both ".jpg" and ".png".
final_data2 <- temp[grepl("\\.(jpg|png)", temp)]
## Download every image. A failing download must not stop the run, so each
## call is wrapped in tryCatch(); the problem is reported with message() and
## the loop moves on. Nothing is printed for successful downloads. Use sink()
## if the messages should be recorded to a file for later inspection.
for (i in seq_along(final_data2)) {
  img_url <- final_data2[i]
  if (is.na(img_url)) next
  tryCatch(
    download_html(img_url),
    warning = function(msg) {
      message("Warning while downloading ", img_url, ": ", conditionMessage(msg))
    },
    error = function(msg) {
      message("Failed to download ", img_url, ": ", conditionMessage(msg))
    }
  )
}

# Preview one image; the file name is a fixed example and is presumably one of
# the files fetched above -- adjust it to a file that exists locally.
imager::load.image("AF0KVvd.jpg") %>% plot
現在許多網頁都為動態網頁，以下示範如何在 R 裡面透過 Selenium Driver 模擬真實用戶觀看網頁，並抓取想要的內容。
# devtools::install_github("ropensci/RSelenium")
# Run the following command in a command prompt opened as administrator:
# java -Dwebdriver.chrome.driver=G://Rcrawler//chromedriver.exe -jar G://Rcrawler//selenium-server-standalone-3.8.0.jar
library(RSelenium)

# Basic browser settings for the remote driver.
remDr <- remoteDriver(
  browserName = "chrome",
  remoteServerAddr = "localhost",
  port = 4444
)

# Open the browser session.
remDr$open()

# Set the window size.
remDr$setWindowSize(width = 800, height = 800)

# Go to the news list page and grab <body> as the target for key presses.
remDr$navigate("https://www.ettoday.net/news/news-list.htm")
webElem <- remDr$findElement("css", "body")

# Simulate scrolling with key presses. Changes in the page data can usually
# (in most cases) be observed under XHR.
scroll_rounds <- 10 # how many times to scroll
for (i in seq_len(scroll_rounds)) {
  webElem$sendKeysToElement(list(key = "end"))
  webElem$sendKeysToElement(list(key = "down_arrow"))
}
# After scrolling as far as needed, read the whole rendered page in one go.
html <- read_html(remDr$getPageSource()[[1]][1])
url <- html %>%
  html_nodes(css = ".part_list_2 a") %>%
  html_attr("href")

# Build absolute URLs. Anchors without an href are dropped, and an empty
# result stays empty (paste0() on a zero-length vector would return the bare
# site URL as a single bogus entry).
url <- url[!is.na(url)]
url <- if (length(url) > 0) {
  paste0("https://www.ettoday.net", url)
} else {
  character(0)
}

# Parse every article and store title and content.
# - Each page is fetched once and reused for both selectors.
# - A missing node becomes NA. The former ifelse(..., content, NULL) raised an
#   error whenever ".story" was absent.
# - When a selector matches several nodes, only the first one is kept, as
#   before.
# - A page that cannot be read is reported and recorded as an NA row, so one
#   failure does not discard the rows already collected.
# - Rows are collected in a list and bound once instead of rbind() per
#   iteration.
first_or_na <- function(x) {
  if (length(x) > 0) x[1] else NA_character_
}

article_rows <- vector("list", length(url))
for (i in seq_along(url)) {
  article_rows[[i]] <- tryCatch(
    {
      page <- read_html(url[i])
      data.frame(
        title = first_or_na(page %>% html_nodes(css = ".title") %>% html_text()),
        content = first_or_na(page %>% html_nodes(css = ".story") %>% html_text()),
        stringsAsFactors = FALSE
      )
    },
    error = function(e) {
      message("Failed to read ", url[i], ": ", conditionMessage(e))
      data.frame(
        title = NA_character_,
        content = NA_character_,
        stringsAsFactors = FALSE
      )
    }
  )
  # Pause briefly after every 30th request to reduce the load on the server.
  if (i %% 30 == 0) Sys.sleep(0.5)
}

my_data <- if (length(article_rows) > 0) {
  do.call(rbind, article_rows)
} else {
  data.frame(
    title = character(0),
    content = character(0),
    stringsAsFactors = FALSE
  )
}
#head(my_data)
#data.table::fwrite(my_data,'G://Rcrawler//newlist.csv') # write the result to disk