Class3_Homework1

簡單的ltn 爬蟲

library(rvest)

## Warning: package 'rvest' was built under R version 3.2.5

## Loading required package: xml2

# Scrape the LTN breaking-news list page into a data frame with one row per
# headline. Every `.lipic` node on the page is treated as one news item.
lipic <- read_html("http://news.ltn.com.tw/list/BreakingNews") %>%
  html_nodes(".lipic")

# The `.picword` nodes carry both the headline text and its href, so select
# them once and reuse the node set for `title` and `link`.
picword <- lipic %>%
  html_nodes(".picword")

# NOTE(review): the UTF-8 -> UTF-8 conversion is nominally a no-op; it looks
# like a workaround for encoding marks on Windows -- confirm before removing.
title <- picword %>%
  html_text() %>%
  iconv(from = "UTF-8", to = "UTF-8")

link <- picword %>%
  html_attr("href")

# Text of every <span> found inside the news items.
# NOTE(review): assumes exactly one <span> per item; data.frame() below stops
# with an error if the column lengths turn out to be incompatible.
datetime <- lipic %>%
  html_nodes("span") %>%
  html_text()

# Keep the scraped text as character columns. Before R 4.0 data.frame()
# converts strings to factors unless told otherwise.
ltn_news <- data.frame(link, title, datetime, stringsAsFactors = FALSE)

# View() opens a data viewer, which is only meaningful in an interactive
# session; skip it when the script is run in batch mode.
if (interactive()) {
  View(ltn_news)
}

進階版ltn 爬蟲- 增添類別資訊

# Liberty Times (LTN) scraper, extended with the category CSS class of each
# news item so it can later be mapped to a readable label.
library(rvest)
lipic <- read_html("http://news.ltn.com.tw/list/BreakingNews") %>%
  html_nodes(".lipic")

# The `.picword` nodes carry both the headline text and its href, so select
# them once and reuse the node set for `title` and `link`.
picword <- lipic %>%
  html_nodes(".picword")

# NOTE(review): the UTF-8 -> UTF-8 conversion is nominally a no-op; it looks
# like a workaround for encoding marks on Windows -- confirm before removing.
title <- picword %>%
  html_text() %>%
  iconv(from = "UTF-8", to = "UTF-8")

link <- picword %>%
  html_attr("href")

# Text of every <span> found inside the news items.
datetime <- lipic %>%
  html_nodes("span") %>%
  html_text()

# The `class` attribute of the <a> nested in each <span>; the example in the
# stylesheet comment further down (`a.tab12`) suggests values like "tab12".
category <- lipic %>%
  html_nodes("span a") %>%
  html_attr("class")

# Keep every column as character. `category` is used as a merge key later,
# and before R 4.0 data.frame() would otherwise turn all four into factors.
# NOTE(review): assumes one <span> and one <span><a> per item; data.frame()
# stops with an error if the column lengths are incompatible.
ltn_news <- data.frame(
  link, title, datetime, category,
  stringsAsFactors = FALSE
)



# Build the lookup table that maps a category CSS class to its display label.
library(stringr)
# The stylesheet stores each label as generated content, for example:
#   .list span a.tab12:after{content:"副刊";}
# Read the stylesheet as plain text. It is CSS, not HTML, so running it
# through an HTML parser and re-serialising it is unnecessary.
css_lines <- readLines(
  "http://news.ltn.com.tw/css/news/style.css?201509",
  encoding = "UTF-8",
  warn = FALSE
)
css_text <- paste(css_lines, collapse = "\n")

# Group 1 is the class name and group 2 is the label.
# - The literal dots are escaped so they no longer match arbitrary characters.
# - The rule tail is matched with a negated class instead of a greedy `.*`,
#   so several rules on one (minified) line are captured individually.
# - Optional whitespace around `{` and after `content:` is tolerated.
label_rule <- paste0(
  '\\.list span a\\.([^:\\{\\},\\s]+):after\\s*\\{\\s*',
  'content:\\s*"([^"]*)";[^\\}]*\\}'
)
map_table <- str_match_all(string = css_text, pattern = label_rule)

# drop = FALSE keeps a two-column matrix even when exactly one rule matched;
# character columns keep the merge key comparable with the scraped classes.
map_df <- as.data.frame(
  map_table[[1]][, 2:3, drop = FALSE],
  stringsAsFactors = FALSE
)
names(map_df) <- c("category", "tagname")

# Join the scraped news rows to their category labels.
# Open the help page for merge() first (interactive reference only).
help("merge")

## starting httpd help server ...

##  done

# Rows are matched on the shared `category` column.
ltn_df <- merge(x = ltn_news, y = map_df, by = "category")

Class3_Homework1

David Chiu

2016年9月25日

簡單的ltn 爬蟲

進階版ltn 爬蟲- 增添類別資訊