簡單的 LTN 爬蟲
library(rvest)
## Warning: package 'rvest' was built under R version 3.2.5
## Loading required package: xml2
# Simple crawler: scrape the Liberty Times (LTN) breaking-news list into a
# data frame with one row per news item (link, title, datetime).
lipic <- read_html("http://news.ltn.com.tw/list/BreakingNews") %>%
  html_nodes(".lipic")

# html_node() (singular) returns exactly one result per element of `lipic`
# (NA when the item has no match), so the three columns always stay
# row-aligned. html_nodes() would pool every match on the page, and
# data.frame() would then error or silently recycle on a length mismatch.
# NOTE(review): assumes each `.lipic` element wraps a single news item --
# confirm against the page markup.
title <- lipic %>%
  html_node(".picword") %>%
  html_text() %>%
  # UTF-8 -> UTF-8 conversion kept from the original; presumably an encoding
  # workaround for non-UTF-8 locales -- confirm before removing.
  iconv(from = "UTF-8", to = "UTF-8")

link <- lipic %>%
  html_node(".picword") %>%
  html_attr("href")

datetime <- lipic %>%
  html_node("span") %>%
  html_text()

# Keep text columns as character rather than factor on R < 4.0.
ltn_news <- data.frame(link, title, datetime, stringsAsFactors = FALSE)

# View() needs an interactive session; skip it when run via Rscript/knitr.
if (interactive()) {
  View(ltn_news)
}
進階版 LTN 爬蟲 - 增添類別資訊
# Liberty Times (LTN) crawler, extended with each item's category CSS class.
library(rvest)

lipic <- read_html("http://news.ltn.com.tw/list/BreakingNews") %>%
  html_nodes(".lipic")

# html_node() (singular) returns exactly one result per element of `lipic`
# (NA when the item has no match), so all columns stay row-aligned even if
# an item lacks a category link. html_nodes() would pool every match on the
# page, and data.frame() would then error or silently recycle on a length
# mismatch.
# NOTE(review): assumes each `.lipic` element wraps a single news item --
# confirm against the page markup.
title <- lipic %>%
  html_node(".picword") %>%
  html_text() %>%
  # UTF-8 -> UTF-8 conversion kept from the original; presumably an encoding
  # workaround for non-UTF-8 locales -- confirm before removing.
  iconv(from = "UTF-8", to = "UTF-8")

link <- lipic %>%
  html_node(".picword") %>%
  html_attr("href")

datetime <- lipic %>%
  html_node("span") %>%
  html_text()

# The category is encoded as the CSS class of the <a> inside the item's
# <span>; the human-readable label is supplied by the site's stylesheet.
category <- lipic %>%
  html_node("span a") %>%
  html_attr("class")

# Keep text columns as character rather than factor on R < 4.0.
ltn_news <- data.frame(
  link, title, datetime, category,
  stringsAsFactors = FALSE
)
# Build the lookup table that maps a category CSS class to its display label.
library(stringr)

# The stylesheet carries each label in a rule shaped like:
#   .list span a.tab12:after{content:"副刊";}
# Read the CSS as plain text: it is not HTML, so it should not go through
# an HTML parser.
css_text <- paste(
  readLines(
    "http://news.ltn.com.tw/css/news/style.css?201509",
    encoding = "UTF-8",
    warn = FALSE
  ),
  collapse = "\n"
)

# Literal dots are escaped, and the tail is bounded by `[^}]*` so that one
# match cannot swallow the following rules when several share a line.
# Group 1 = class name, group 2 = label text.
tag_rule <- '\\.list span a\\.([\\w-]+):after\\{content:"([^"]*)";[^}]*\\}'
map_table <- str_match_all(css_text, pattern = tag_rule)

# drop = FALSE keeps a two-column matrix even when exactly one rule matches;
# otherwise the result collapses to a vector and the renaming below fails.
map_df <- as.data.frame(
  map_table[[1]][, 2:3, drop = FALSE],
  stringsAsFactors = FALSE
)
names(map_df) <- c("category", "tagname")
# Join the scraped news with the category labels on the shared `category`
# column. all.x = TRUE keeps every news row: items whose class has no entry
# in map_df get tagname = NA instead of being silently dropped by the
# default inner join.
ltn_df <- merge(ltn_news, map_df, by = "category", all.x = TRUE)