UDN 新聞爬蟲
抓取新聞列表
library(rvest)
## Loading required package: xml2
# Site root, used as the base when resolving links from the listing page.
domain <- "http://udn.com"

# Download and parse the breaking-news listing page.
res <- read_html(paste0(domain, "/news/breaknews/1"))

# Collect the href attribute of every anchor inside #breaknews_body.
hrefs <- res %>%
  html_nodes("#breaknews_body a") %>%
  html_attr("href")

# html_attr() yields NA for anchors that have no href, and the same link can
# occur more than once in the listing; drop both so each article is requested
# exactly once and no "<domain>NA" pseudo-URL is produced.
hrefs <- unique(hrefs[!is.na(hrefs)])

# Resolve each href against the site root. Unlike plain string concatenation
# this leaves already-absolute links untouched and still handles relative ones.
urls <- xml2::url_absolute(hrefs, domain)
內容頁剖析器
#' Parse one UDN article page into a data frame.
#'
#' The page is downloaded and parsed once, then the title, meta information
#' and article body are pulled out of it by CSS id selector.
#'
#' @param ele Anything accepted by `read_html()`; the surrounding script passes
#'   an article URL.
#' @return A data.frame with character columns `title`, `meta` and `article`.
#'   A field whose selector matches no node is returned as `NA` instead of
#'   making `data.frame()` fail on mismatched column lengths.
parseUDN <- function(ele) {
  # Fetch and parse the document a single time; every field reuses it.
  page <- read_html(ele)

  # Text of the node(s) matching `selector`, or NA when nothing matches.
  extract_text <- function(selector) {
    nodes <- html_nodes(page, selector)
    # UTF-8 -> UTF-8 conversion kept from the original implementation.
    txt <- iconv(html_text(nodes), from = "UTF-8", to = "UTF-8")
    if (length(txt) == 0L) NA_character_ else txt
  }

  data.frame(
    title = extract_text("#story_art_title"),
    # NOTE(review): "bady" is reproduced exactly from the original selector;
    # presumably it matches the id used by the site - verify before changing.
    meta = extract_text("#story_bady_info"),
    article = extract_text("#story_body_content"),
    # Keep text as character on every R version (pre-4.0 defaulted to factor).
    stringsAsFactors = FALSE
  )
}
抓取內容資訊
# Parse every article page. A single unreachable or malformed page must not
# abort the whole crawl, so each failure is reported as a warning and skipped.
udndf <- lapply(urls, function(url) {
  tryCatch(
    parseUDN(url),
    error = function(e) {
      warning("Skipping ", url, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
})
# Drop the NULL placeholders left behind by pages that failed.
udndf <- Filter(Negate(is.null), udndf)
合併抓取新聞資料
# Stack the per-article data frames row-wise into one data frame.
udn <- do.call(what = rbind, args = udndf)