# LTN 新聞爬蟲 (LTN news crawler)
# 抓取新聞列表 (Fetch the news list)
library(rvest)
## Loading required package: xml2
# Download and parse the LTN breaking-news listing page.
res <- read_html("http://news.ltn.com.tw/list/BreakingNews")

# All elements with class "lipic"; every field below is read from these nodes.
# NOTE(review): presumably one node per news entry -- confirm against the page.
lipic <- res %>%
  html_nodes(".lipic")

# Article URL: href attribute of each "a.picword" anchor.
link <- lipic %>%
  html_nodes("a.picword") %>%
  html_attr("href")

# Headline: text of the same "a.picword" anchors.
# The UTF-8 -> UTF-8 iconv() pass is kept exactly as in the original.
# NOTE(review): looks like a workaround for platform encoding issues -- confirm.
title <- lipic %>%
  html_nodes("a.picword") %>%
  html_text() %>%
  iconv(from = "UTF-8", to = "UTF-8")

# Text of every <span> under the "lipic" nodes (the variable name suggests
# this is the publication timestamp).
datetime <- lipic %>%
  html_nodes("span") %>%
  html_text() %>%
  iconv(from = "UTF-8", to = "UTF-8")

# CSS class of each anchor that is a direct child of a <span>.
category <- lipic %>%
  html_nodes("span > a") %>%
  html_attr("class")
# 抓取 CSS (Fetch the stylesheet)
# Fetch the site's stylesheet. It is read with read_html() even though the
# payload is CSS, so `css` is a parsed document object rather than plain text.
css <- read_html(
  "http://news.ltn.com.tw/css/news/style.css?201509",
  encoding = "UTF-8"
)
# 取得類別對應表 (Build the category lookup table)
library(stringr)
# Extract (class name, label) pairs from stylesheet rules of the form
#   .list span a.<class>:after{content:"<label>";}
# `string = .` routes the piped text into the `string` argument. In each
# result matrix, column 1 is the full match, column 2 the class name and
# column 3 the label.
map_table <- css %>%
  iconv(from = "UTF-8", to = "UTF-8") %>%
  str_match_all(
    pattern = '.list span a.(.*?):after\\{content:"(.*?)";\\}',
    string = .
  )

# Lookup list keyed by class name. Assigning by name means a class that
# appears in several rules keeps the label of its last rule.
map_list <- list()
map_list[map_table[[1]][, 2]] <- map_table[[1]][, 3]

# Translate each scraped class into its label, yielding exactly one value per
# element of `category`. A class with no matching rule (or a missing class
# attribute) becomes NA instead of being dropped, so `cat` always has the same
# length as `category` and stays row-aligned with the other scraped vectors.
# The result is an unnamed character vector.
# (`cat` masks base::cat as a variable only; calls to cat() still resolve to
# the base function.)
cat <- vapply(
  category,
  function(e) {
    label <- if (is.na(e)) NULL else map_list[[e]]
    if (is.null(label)) NA_character_ else label
  },
  character(1),
  USE.NAMES = FALSE
)
# 合併欄位成 Data Frame (Combine the fields into a data frame)
# Assemble the scraped vectors into one data frame, one row per element.
# stringsAsFactors = FALSE keeps the text columns as character vectors on
# R < 4.0 as well (it is already the default from R 4.0 onwards), so the
# result does not depend on the R version.
ltn <- data.frame(
  title = title,
  link = link,
  datetime = datetime,
  category = cat,
  stringsAsFactors = FALSE
)