# ltn_news_crawler
library(rvest)
## Warning: package 'rvest' was built under R version 3.2.5
## Loading required package: xml2
library(stringr)
# Download the breaking-news listing and pull the per-item fields out of
# every `li.lipic` element.
res <- read_html("http://news.ltn.com.tw/list/BreakingNews")
lipic <- html_nodes(res, "li.lipic")

# Text of each `a.picword` link; the UTF-8 -> UTF-8 iconv() pass is kept
# exactly as in the original pipeline.
title <- iconv(
  html_text(html_nodes(lipic, "a.picword")),
  from = "UTF-8",
  to = "UTF-8"
)

# href attribute of the same `a.picword` links.
link <- html_attr(html_nodes(lipic, "a.picword"), "href")

# Text of every <span> found under the list items.
# NOTE(review): presumably exactly one span per item holds the timestamp;
# confirm against the live markup.
dt <- html_text(html_nodes(lipic, "span"))

# CSS class of the <a> nested inside each span; used later as the key for
# looking up the category label.
category <- html_attr(html_nodes(lipic, "span a"), "class")
# Fetch the site stylesheet and extract every rule of the form
#   .list span a.<class>:after{content:"<label>";...}
# capturing <class> (group 1) and <label> (group 2).
# read_html() returns a parsed document rather than a string; iconv() is
# documented to coerce non-character input with as.character(), so the
# regex runs on the serialized text of the stylesheet.
map_table <- read_html(
  'http://news.ltn.com.tw/css/news/style.css?201509',
  encoding = 'UTF-8'
) %>%
  iconv(from = 'UTF-8', to = 'UTF-8') %>%
  str_match_all(
    pattern = '.list span a.(.*?):after\\{content:"(.*?)";.*\\}'
  )

# map_table[[1]] is the match matrix for the single input string:
# column 2 holds the class names, column 3 the matching content labels.
map_list <- list()
map_list[map_table[[1]][, 2]] <- map_table[[1]][, 3]

# Translate each scraped class into its label, producing exactly one value
# per element of `category`. Indexing the list with a missing (NA) or
# unmapped class yields NULL; unlist() would silently drop those entries and
# leave `tag` shorter than (and misaligned with) the other scraped columns.
# Such entries become NA here instead, so lengths always match.
tag <- vapply(
  map_list[category],
  function(label) if (is.null(label)) NA_character_ else label,
  character(1),
  USE.NAMES = FALSE
)
# Combine the scraped vectors into one data frame with columns
# datetime, link, title and category.
# stringsAsFactors = FALSE keeps these free-text columns as character on
# R < 4.0 as well, where data.frame() would otherwise convert them to
# factors; on R >= 4.0 this is already the default.
ltn_news <- data.frame(
  datetime = dt,
  link = link,
  title = title,
  category = tag,
  stringsAsFactors = FALSE
)