LTN 新聞爬蟲

抓取新聞列表

library(rvest)
## Loading required package: xml2
# Download and parse the LTN breaking-news listing page.
res <- read_html("http://news.ltn.com.tw/list/BreakingNews")

# All nodes carrying the "lipic" class; every field below is looked up
# inside this node set.
lipic <- html_nodes(res, ".lipic")

# The "a.picword" anchors supply both the article URL (href attribute)
# and the headline (anchor text).
link <- html_attr(html_nodes(lipic, "a.picword"), "href")
# NOTE(review): the UTF-8 -> UTF-8 iconv() calls are presumably a
# platform encoding workaround -- confirm before removing them.
title <- iconv(
  html_text(html_nodes(lipic, "a.picword")),
  from = "UTF-8", to = "UTF-8"
)

# Text of every <span> found under the list nodes.
# NOTE(review): assumes one <span> per headline so that this lines up
# with `title` and `link` -- verify against the page markup.
datetime <- iconv(
  html_text(html_nodes(lipic, "span")),
  from = "UTF-8", to = "UTF-8"
)

# The class attribute of the anchor directly inside each <span>; it is
# translated into a display label further down in this script.
category <- html_attr(html_nodes(lipic, "span > a"), "class")

抓取 CSS 樣式表

# Fetch the site stylesheet that the category mapping below is parsed from.
# NOTE(review): read_html() runs the CSS text through the HTML parser and
# returns an xml document; the code below turns it back into text before
# pattern matching. Reading the file as plain text may be cleaner --
# confirm nothing else relies on `css` being an xml document first.
css <- read_html(
  "http://news.ltn.com.tw/css/news/style.css?201509",
  encoding = "UTF-8"
)

取得類別對應表

library(stringr)
# Extract every rule of the form
#   .list span a.<class>:after{content:"<label>";}
# from the stylesheet. In each match matrix returned by str_match_all(),
# column 2 holds the class name and column 3 the label text. The dots in
# front of "list" and the class name are escaped so they only match a
# literal "." rather than any character.
map_table <- css %>%
  iconv(from = 'UTF-8', to = 'UTF-8') %>%
  str_match_all(pattern = '\\.list span a\\.(.*?):after\\{content:"(.*?)";\\}')

# Named list: class name -> label.
map_list <- list()
map_list[map_table[[1]][, 2]] <- map_table[[1]][, 3]

# Translate each scraped class into its label. The lookup is positional:
# a class with no matching CSS rule (or an NA class) becomes NA instead of
# being dropped, so `cat` always has the same length as `category` and
# stays aligned with the other scraped columns. The result is unnamed so
# it cannot leak into the row names of a data frame built from it.
cat <- vapply(
  map_list[category],
  function(label) if (is.null(label)) NA_character_ else label,
  character(1),
  USE.NAMES = FALSE
)

合併欄位成Data Frame

# Assemble the scraped fields into one table, one row per headline.
# All four vectors must have the same length (or be recyclable), otherwise
# data.frame() stops with an error.
ltn <- data.frame(
  title    = title,
  link     = link,
  datetime = datetime,
  category = cat
)