GET

# Method1
library(rvest)
## Warning: package 'rvest' was built under R version 3.4.2
## Loading required package: xml2
# Address of the realtime news listing used by both methods below.
newsurl <- 'https://tw.appledaily.com/new/realtime'
# rvest downloads and parses the page in a single call.
apple <- newsurl %>% read_html()
#as.character(apple)

# Method2
library(httr)
# httr first returns a response object; content() then extracts its body.
apple <- newsurl %>% GET()
apple %>% content()
## {xml_document}
## <html lang="zh-TW">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body id="article" class="all">\n    <div class="wrapper">\n         ...

POST

library(httr)
# Timetable search endpoint (thsrc.com.tw).
url <- 'https://www.thsrc.com.tw/tw/TimeTable/SearchResult'
# Form fields submitted with the search request.
payload <- list(
  StartStation = '977abb69-413a-4ccf-a109-0272c24fd490',
  EndStation   = 'a7a04c89-900b-4798-95a3-c01c455622f4',
  SearchDate   = '2017/11/20',
  SearchTime   = '11:00',
  SearchWay    = 'DepartureInMandarin'
)
# Submit the fields form-encoded, then show the body of the response.
res <- POST(url = url, body = payload, encode = 'form')
res %>% content()
## {xml_document}
## <html lang="zh-tw">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body class="revision_cn">\r\n\t\r\n\t\r\n<header><div class="revisi ...
## [3] <script src="/Content/Script/moveScroll.js" type="text/javaScript">< ...
## [4] <script src="/Content/Script/init_index.js" type="text/javaScript">< ...
## [5] <script src="/Content/Script/jquery.ellipsis.js" type="text/javaScri ...
## [6] <script src="/Content/Script/script-2.js" type="text/javaScript"></s ...
## [7] <script type="text/javascript">\r\n\r\n    var _gaq = _gaq || [];\r\ ...
## [8] <script src="/Content/Script/revision.js" type="text/javaScript"></s ...
#as.character(content(res))

台鐵簡易查詢

# Taiwan Railways timetable query: every search option travels in the query
# string, assembled here one parameter per line.
res <- read_html(paste0(
  'http://twtraffic.tra.gov.tw/twrail/SearchResult.aspx',
  '?searchtype=0',
  '&searchdate=2017/11/20',
  '&fromstation=1810',
  '&tostation=1008',
  '&trainclass=%271100%27,%271101%27,%271102%27,%271107%27,%271110%27,%271120%27',
  '&fromtime=0600',
  '&totime=2359'
))

PCHOME

#res <- read_html('http://ecapi.pchome.com.tw/ecshop/prodapi/v2/prod/DCAKAD-A90061LV6-000&store=DCAKAD&fields=Seq,Id,Name,Nick,Store,PreOrdDate,SpeOrdDate,Price,Discount,Pic,Weight,ISBN,Qty,Bonus,isBig,isSpec,isCombine,isDiy,isRecyclable,isCarrier,isMedical,isBigCart,isSnapUp,isDescAndIntroSync,isFoodContents,isHuge,isEnergySubsidy&_callback=jsonp_prod&1511147880?_callback=jsonp_prod')
#as.character(res)

# Product API request. The long URL is split into pieces that paste0() joins
# back into the same address.
res <- read_html(paste0(
  'http://ecapi.pchome.com.tw/ecshop/prodapi/v2/prod/DGBJBH-A90083AYO-000',
  '&store=DGBJBY',
  '&fields=Seq,Id,Name,Nick,Store,PreOrdDate,SpeOrdDate,Price,Discount,Pic,',
  'Weight,ISBN,Qty,Bonus,isBig,isSpec,isCombine,isDiy,isRecyclable,isCarrier,',
  'isMedical,isBigCart,isSnapUp,isDescAndIntroSync,isFoodContents,isHuge,',
  'isEnergySubsidy',
  '&_callback=jsonp_prod&1511156520?_callback=jsonp_prod'
))
# Dump the parsed document as text to inspect what the API sent back.
res %>% as.character()
## [1] "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html><body><p>try{jsonp_prod({\"DGBJBH-A90083AYO-000\":{\"Seq\":17907797,\"Id\":\"DGBJBH-A90083AYO-000\",\"Name\":\"Nintendo Switch\\u300a\\u85a9\\u723e\\u9054\\u50b3\\u8aaa\\uff1a\\u8352\\u91ce\\u4e4b\\u606f The Legend of Zelda: Breath of the Wild\\u300b\\u82f1\\u65e5\\u591a\\u570b\\u8a9e\\u6587\\u7f8e\\u7248\",\"Nick\":\"<font color=\"RED\"><b>\\u767c\\u552e\\u65e5\\ufe31<font color=\"BLUE\"><b>2017-03-03 <br>Nintendo Switch\\u300a\\u85a9\\u723e\\u9054\\u50b3\\u8aaa\\uff1a\\u8352\\u91ce\\u4e4b\\u606f The Legend of Zelda: Breath of the Wild\\u300b\\u82f1\\u65e5\\u591a\\u570b\\u8a9e\\u6587\\u7f8e\\u7248\",\"Store\":\"DGBJBH\",\"PreOrdDate\":\"\",\"SpeOrdDate\":\"\",\"Price\":{\"M\":0,\"P\":2190},\"Discount\":0,\"Pic\":{\"B\":\"\\/items\\/DGBJBHA90083AYO\\/000001_1493886375.jpg\",\"S\":\"\\/items\\/DGBJBHA90083AYO\\/000002_1493886375.jpg\"},\"Weight\":0.06,\"ISBN\":\"\",\"Qty\":0,\"Bonus\":0,\"isBig\":0,\"isSpec\":0,\"isCombine\":0,\"isDiy\":0,\"isRecyclable\":0,\"isCarrier\":0,\"isMedical\":0,\"isBigCart\":1,\"isSnapUp\":0,\"isDescAndIntroSync\":0,\"isFoodContents\":0,\"isHuge\":0,\"isEnergySubsidy\":0}});}catch(e){if(window.console){console.log(e);}}</b></font></b></font></p></body></html>\n"

591

# Listing search request; the filters are passed in the query string.
res <- 'https://sale.591.com.tw/home/search/list?type=2&&regionid=1&timestamp=1511148080360' %>%
  read_html()
#as.character(res)

HTML, CSS, JavaScript

Magrittr Sample

library(magrittr)
# Nested calls read inside-out: head() takes the first rows of iris, tail()
# keeps the last 3 of those, and sum() adds up their Sepal.Length.
sum(tail(head(iris) , 3)$Sepal.Length)
## [1] 15
# The same computation as a pipeline, read left to right. `.` stands for the
# value handed over by the previous step.
iris %>% head() %>% tail(3) %>% .$Sepal.Length %>% sum()
## [1] 15

Rvest Parsing Example

# Small HTML document for trying out CSS selectors: one <h1> carrying an id
# and two <a> links that share a class.
html_sample <- '
<html> 
 <body> 
 <h1 id="title">Hello World</h1> 
 <a href="#" class="link">This is link1</a> 
 <a href="# link2" class="link">This is link2</a> 
 </body> 
</html>
'

# html_node() returns the first element matching the selector (here a tag name).
read_html(html_sample) %>% html_node('h1')
## {xml_node}
## <h1 id="title">
# html_text() extracts the text contained in the selected node.
read_html(html_sample) %>% html_node('h1') %>% html_text()
## [1] "Hello World"
# Select by id: prefix the id with '#'.
read_html(html_sample) %>% html_node('#title') %>% html_text()
## [1] "Hello World"
# Tag and id combined: an <h1> whose id is "title".
read_html(html_sample) %>% html_node('h1#title') %>% html_text()
## [1] "Hello World"
# A space means "descendant of": the h1#title located inside <body>.
read_html(html_sample) %>% html_node('body h1#title') %>% html_text()
## [1] "Hello World"
read_html(html_sample) %>% html_node('body #title') %>% html_text()
## [1] "Hello World"
# html_node() stops at the first <a> ...
read_html(html_sample) %>% html_node('a') %>% html_text()
## [1] "This is link1"
# ... whereas html_nodes() returns every match.
read_html(html_sample) %>% html_nodes('a') %>% html_text()
## [1] "This is link1" "This is link2"
# Select by class: prefix the class name with '.'.
read_html(html_sample) %>% html_nodes('.link') %>% html_text()
## [1] "This is link1" "This is link2"
# Tag and class combined: <a> elements whose class is "link".
read_html(html_sample) %>% html_nodes('a.link') %>% html_text()
## [1] "This is link1" "This is link2"
read_html(html_sample) %>% html_nodes('body a.link') %>% html_text()
## [1] "This is link1" "This is link2"
# html_attr() reads an attribute value (here href) instead of the text.
read_html(html_sample) %>% html_nodes('body .link') %>% html_attr('href')
## [1] "#"       "# link2"

蘋果新聞列表爬蟲範例

library(rvest)
# Download the realtime news listing page.
newsurl <- 'https://tw.appledaily.com/new/realtime/'
apple <- read_html(newsurl)

# Every headline on the listing is an <a> inside an element of class "rtddt".
rtddt <- html_nodes(apple, '.rtddt a')
rtddt
## {xml_nodeset (30)}
##  [1] <a href="https://tw.news.appledaily.com/life/realtime/20171120/1244 ...
##  [2] <a href="https://tw.news.appledaily.com/life/realtime/20171120/1244 ...
##  [3] <a href="https://tw.news.appledaily.com/local/realtime/20171120/124 ...
##  [4] <a href="https://tw.finance.appledaily.com/realtime/20171120/124457 ...
##  [5] <a href="https://tw.news.appledaily.com/local/realtime/20171120/124 ...
##  [6] <a href="https://tw.entertainment.appledaily.com/realtime/20171120/ ...
##  [7] <a href="https://tw.news.appledaily.com/local/realtime/20171120/124 ...
##  [8] <a href="https://tw.news.appledaily.com/international/realtime/2017 ...
##  [9] <a href="https://tw.news.appledaily.com/life/realtime/20171120/1244 ...
## [10] <a href="https://tw.finance.appledaily.com/realtime/20171120/124457 ...
## [11] <a href="https://tw.news.appledaily.com/politics/realtime/20171120/ ...
## [12] <a href="https://tw.news.appledaily.com/local/realtime/20171120/124 ...
## [13] <a href="https://tw.finance.appledaily.com/realtime/20171120/124456 ...
## [14] <a href="https://tw.news.appledaily.com/nextmag/realtime/20171120/1 ...
## [15] <a href="https://tw.entertainment.appledaily.com/realtime/20171120/ ...
## [16] <a href="https://tw.finance.appledaily.com/realtime/20171120/124456 ...
## [17] <a href="https://tw.lifestyle.appledaily.com/lifestyle/realtime/201 ...
## [18] <a href="https://tw.news.appledaily.com/nextmag/realtime/20171120/1 ...
## [19] <a href="https://tw.news.appledaily.com/international/realtime/2017 ...
## [20] <a href="https://tw.news.appledaily.com/forum/realtime/20171120/124 ...
## ...
as.character(rtddt[1])
## [1] "<a href=\"https://tw.news.appledaily.com/life/realtime/20171120/1244579/\" target=\"_blank\">\n                                        <time>16:15</time><h2>生活</h2>\n                                        <h1><font color=\"#383c40\">響應聯合國兒童權利公約 新北共融遊戲場誕...</font></h1>\n                                    </a>"
# Extract one value per headline anchor. html_node() (singular) yields exactly
# one result for each anchor in rtddt, NA when the element is absent, so the
# vectors always have the same length as rtddt and stay aligned row by row.
# html_nodes() would flatten all matches and silently drop anchors that lack
# the element, which breaks or misaligns the data frame below.
title    <- rtddt %>% html_node('h1') %>% html_text()
category <- rtddt %>% html_node('h2') %>% html_text()
dt       <- rtddt %>% html_node('time') %>% html_text()
# The link target is an attribute of the anchor itself.
link     <- rtddt %>% html_attr('href')

# One row per headline: title, category, time of day and article URL.
applenews <- data.frame(title = title, category = category, time = dt, url = link, stringsAsFactors = FALSE)

#applenews

Tesseract

抓 PTT

# Download the board index and pick out the post entries; each post on the
# page is an element of class "r-ent".
ptt <- read_html('https://www.ptt.cc/bbs/Creditcard/index.html')
ent <- html_nodes(ptt, '.r-ent')
# Show the raw markup of the first entry to see which fields it contains.
as.character(ent[1])
## [1] "<div class=\"r-ent\">\n\t\t\t<div class=\"nrec\"><span class=\"hl f2\">3</span></div>\n\t\t\t<div class=\"mark\"></div>\n\t\t\t<div class=\"title\">\n\t\t\t\n\t\t\t\t<a href=\"/bbs/creditcard/M.1511157944.A.011.html\">[問題] 中信的聯徵</a>\n\t\t\t\n\t\t\t</div>\n\t\t\t<div class=\"meta\">\n\t\t\t\t<div class=\"date\">11/20</div>\n\t\t\t\t<div class=\"author\">HyperactiveX</div>\n\t\t\t</div>\n\t\t</div>"
# Extract one value per post entry. html_node() (singular) returns exactly one
# result for each element of ent, NA when the field is missing, so all four
# vectors have the same length and line up row by row. html_nodes() would
# flatten the matches and drop entries that lack a field.
# trimws() strips the whitespace that surrounds the text in the markup.
title  <- ent %>% html_node('.title') %>% html_text() %>% trimws()
nrec   <- ent %>% html_node('.nrec') %>% html_text() %>% trimws()

author <- ent %>% html_node('.author') %>% html_text() %>% trimws()
date   <- ent %>% html_node('.date') %>% html_text() %>% trimws()

# One row per post: title, contents of the "nrec" field, author and date.
df <- data.frame(title = title, nrec= nrec, author= author, date= date, stringsAsFactors = FALSE)
#df

抓取內容頁

# Download a single article page.
res <- read_html('https://tw.news.appledaily.com/local/realtime/20171120/1244445/')

# For each field, take the text of the first node matching its selector.
title    <- html_text(html_node(res, 'h1'))
article  <- html_text(html_node(res, '.ndArticle_margin p'))
dt       <- html_text(html_node(res, '.ndArticle_creat'))
view_cnt <- html_text(html_node(res, '.ndArticle_view'))

# Combine the fields into a one-row data frame and print it.
newsdf <- data.frame(
  title    = title,
  article  = article,
  dt       = dt,
  view_cnt = view_cnt,
  stringsAsFactors = FALSE
)
newsdf
##                                    title
## 1 問卦實況主腳踏兩條船? 20網友留言挨告
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          article
## 1 有網友在巴哈姆特電玩資訊站詢問網友,是否有《神魔之塔》邱姓實況主「平民百姓」用真實身分騙20歲小女友的八卦?許多網友紛紛在下面留言,指他腳踏兩條船。邱男認為名譽受辱,對留言的20名網友提告誹謗,但台北地檢署認為實況主為公眾人物,網友對可受公評之事留言批評不違法,今將其中17名成年網友不起訴,另3名未成年網友則移請少年法庭審理。另外,邱男還起違反《著作權法》告訴,指控網友將他玩遊戲進行遊戲攻略時的畫面截圖,還加註意見成為簽名檔散布,不過檢察官認為網友做此事並未營利、屬合理使用範圍,並無違反《著作權法》,今一併不起訴處分。據了解,暱稱「平民百姓」的邱男,為「神魔之塔攻略網站G8GAME」的創站成員之一。2015年底,暱稱「牛肉騰」的網友,使用邱男玩遊戲的截圖,加註自己的意見,說他提供的關卡資訊從不更新,成為簽名檔,張貼在巴哈姆特網站,邱男見到到特地發文澄清,並無從不更新的情形。沒想到牛肉騰事件在網路發酵,有網友在巴哈姆特網站、PTT,在巴哈姆特自己的小屋詢問有沒有「平民百姓」用真實身分騙20歲女友的八卦?結果引發許多網友留言指責他腳踏兩條船,甚至連邱男的女友也加入戰局,上網留言。邱男認為這些都是不實的言論,已經損及他的名譽,去年曾透過律師發表聲明,指稱「諸多網友不明究理,於自己的巴哈姆特小屋上散布不實謠言妨害本人名譽,甚至散布地點擴及G8GAME網站、本人個人粉絲頁、YOUTUBE頻道、均有網友前來張貼損害本人名譽之文字,顯見已有特定人士刻意幕後操弄網友,對本人及G8GAME進行惡意詆毀及集體霸凌之行徑。」邱男也表示,他已完成相關事證的蒐集,並委任律師處理,同時也「敬告所有曾經張貼、轉貼任何涉及本人、損及本人名譽、詆毀G8GAME網站文字的網友,自即日起立即停止相關行為,並自本聲明刊登之日起五日內,主動聯繫本人道歉,並協商和解事宜。」只不過台北地檢署調查後,認為截圖為合理使用,網友對可受公評之事留言批評也不違法,今將其中17名成年網友不起訴,另3名未成年網友則移請少年法庭審理。(呂志明/台北報導)【更多新聞,請看《蘋果陪審團》粉絲團】
##                           dt view_cnt
## 1 建立時間:2017/11/20 13:48     1703
# Scrape a single article page into a one-row data frame.
#
# url: address of the article page, passed straight to read_html().
#
# Returns a data.frame with the columns title, article, dt, category and
# clicked. Each column holds the text of the first node matching its selector.
getArticle <- function(url){
  page <- read_html(url)

  # Text of the first node in the page that matches a CSS selector.
  first_text <- function(css) html_text(html_node(page, css))

  data.frame(
    title    = first_text('h1'),
    article  = first_text('.ndArticle_margin p'),
    dt       = first_text('.ndArticle_creat'),
    category = first_text('.ndgTag .current'),
    clicked  = first_text('.ndArticle_view'),
    stringsAsFactors = FALSE
  )
}

# Scrape two articles from different sections ...
df1 <- 'https://tw.news.appledaily.com/local/realtime/20171120/1244445/' %>%
  getArticle()

df2 <- 'https://tw.finance.appledaily.com/realtime/20171120/1244452/' %>%
  getArticle()

# ... and stack the two one-row results into a single data frame.
do.call(rbind, list(df1, df2))
##                                               title
## 1            問卦實況主腳踏兩條船? 20網友留言挨告
## 2 <U+200B>7-ELEVEN熱食區年賣2.4億個 營業額上看30億
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          article
## 1 有網友在巴哈姆特電玩資訊站詢問網友,是否有《神魔之塔》邱姓實況主「平民百姓」用真實身分騙20歲小女友的八卦?許多網友紛紛在下面留言,指他腳踏兩條船。邱男認為名譽受辱,對留言的20名網友提告誹謗,但台北地檢署認為實況主為公眾人物,網友對可受公評之事留言批評不違法,今將其中17名成年網友不起訴,另3名未成年網友則移請少年法庭審理。另外,邱男還起違反《著作權法》告訴,指控網友將他玩遊戲進行遊戲攻略時的畫面截圖,還加註意見成為簽名檔散布,不過檢察官認為網友做此事並未營利、屬合理使用範圍,並無違反《著作權法》,今一併不起訴處分。據了解,暱稱「平民百姓」的邱男,為「神魔之塔攻略網站G8GAME」的創站成員之一。2015年底,暱稱「牛肉騰」的網友,使用邱男玩遊戲的截圖,加註自己的意見,說他提供的關卡資訊從不更新,成為簽名檔,張貼在巴哈姆特網站,邱男見到到特地發文澄清,並無從不更新的情形。沒想到牛肉騰事件在網路發酵,有網友在巴哈姆特網站、PTT,在巴哈姆特自己的小屋詢問有沒有「平民百姓」用真實身分騙20歲女友的八卦?結果引發許多網友留言指責他腳踏兩條船,甚至連邱男的女友也加入戰局,上網留言。邱男認為這些都是不實的言論,已經損及他的名譽,去年曾透過律師發表聲明,指稱「諸多網友不明究理,於自己的巴哈姆特小屋上散布不實謠言妨害本人名譽,甚至散布地點擴及G8GAME網站、本人個人粉絲頁、YOUTUBE頻道、均有網友前來張貼損害本人名譽之文字,顯見已有特定人士刻意幕後操弄網友,對本人及G8GAME進行惡意詆毀及集體霸凌之行徑。」邱男也表示,他已完成相關事證的蒐集,並委任律師處理,同時也「敬告所有曾經張貼、轉貼任何涉及本人、損及本人名譽、詆毀G8GAME網站文字的網友,自即日起立即停止相關行為,並自本聲明刊登之日起五日內,主動聯繫本人道歉,並協商和解事宜。」只不過台北地檢署調查後,認為截圖為合理使用,網友對可受公評之事留言批評也不違法,今將其中17名成年網友不起訴,另3名未成年網友則移請少年法庭審理。(呂志明/台北報導)【更多新聞,請看《蘋果陪審團》粉絲團】
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     7-ELEVEN今天宣告推出「蒸美味關東煮」系列,斥資上千萬開發新設備,新品包括蒸的花生米血糕、蒸糯米腸、蒸台灣黑輪等,預估年底導入500間店,明年目標為2000店。統一表示,蒸美味熱食自助區的「七大天王」一年賣達2.4億個,今年營業額上看30億元。7-ELEVEN表示,現有門市的蒸美味熱食自助區,涵蓋茶葉蛋、大燒包、熱狗大亨、關東煮、蒸地瓜、蒸茶碗蒸及蒸黃金玉米等七大結構,號稱「七大天王」,受到不同族群喜愛,如蒸地瓜主攻女性上班族,多購買當作早餐及排毒餐,一年熱銷300萬條。默默熱銷20年的茶碗蒸則以醫院型、交通轉運站商圈最受歡迎,預估今年底更可賣出超過230萬個,約可疊成307座101大樓。「七大天王」一年賣量高達2.4億個,等於平均每位國人每年到7-ELEVEN蒸美味熱食自助區購買10次以上。7-ELEVEN指出,「蒸美味關東煮」系列,除原有湯鍋式關東煮(原味及麻辣兩種)外,開發蒸的花生米血糕、蒸糯米腸、蒸台灣黑輪…等台式小吃,及花生粉、辣味噌醬、日式黃芥末等沾醬配料,打破國人對關東煮的刻板印象,後續也將開發更多新品,像是蒸蘿蔔糕、蒸燒賣、蒸芋粿巧及蒸肉圓…等台港小點。另外,全新的7-ELEVEN「蒸美味關東煮」,在設備上,研發出「恆溫循環式蒸氣專用設備」,有別於湯鍋以蔬菜、豆類關東煮為主,蒸鍋則主打肉類、魚漿類,以86℃恆溫循環式蒸氣來保持關東煮的鮮甜口感。(彭蕙珍/台北報導) <U+00A0> <U+00A0>
##                           dt category clicked
## 1 建立時間:2017/11/20 13:48     社會    1703
## 2 建立時間:2017/11/20 13:32 財經地產    1073

完成蘋果新聞爬蟲



# Scrape a single article page into a one-row data frame.
#
# url: address of the article page, passed straight to read_html().
#
# Returns a data.frame with the columns title, article, dt, category and
# clicked, in that order.
getArticle <- function(url){
  doc <- read_html(url)

  # Output column name -> CSS selector locating that field in the page.
  selectors <- c(
    title    = 'h1',
    article  = '.ndArticle_margin p',
    dt       = '.ndArticle_creat',
    category = '.ndgTag .current',
    clicked  = '.ndArticle_view'
  )

  # For every field, take the text of the first matching node.
  fields <- lapply(selectors, function(css) html_text(html_node(doc, css)))

  as.data.frame(fields, stringsAsFactors = FALSE)
}

# Crawl the first three listing pages and save every article to a CSV file.
newsurl <- 'https://tw.appledaily.com/new/realtime/'

# Step 1: gather the article links. The page number is appended to the listing
# URL, and html_attr() returns the href of every headline anchor at once.
links <- unlist(lapply(1:3, function(page) {
  apple <- read_html(paste0(newsurl, page))
  apple %>% html_nodes('.rtddt a') %>% html_attr('href')
}))

# Step 2: fetch each article. A failure on one link (missing href, network
# error, unexpected page) is reported as a warning and skipped, so it does not
# abort the whole crawl.
articles <- lapply(links, function(url) {
  tryCatch(
    getArticle(url),
    error = function(e) {
      warning('Skipping ', url, ': ', conditionMessage(e), call. = FALSE)
      NULL
    }
  )
})
articles <- Filter(Negate(is.null), articles)

# Step 3: bind all rows in one call instead of growing dfall with rbind() on
# every iteration. Fall back to an empty data frame when nothing was fetched.
dfall <- if (length(articles) > 0) do.call(rbind, articles) else data.frame()
write.csv(x= dfall, file= 'applenews.csv')
#dfall

抓取範例

# Crawl only the first listing page and show the result.
apple <- read_html('https://tw.appledaily.com/new/realtime/')
rtddt <- apple %>% html_nodes('.rtddt a')

# html_attr() works on the whole node set, so every link is extracted at once.
urls <- rtddt %>% html_attr('href')

# Fetch every article, then bind the rows in a single call rather than growing
# dfall with rbind() on each iteration. An empty listing yields an empty
# data frame.
articles <- lapply(urls, getArticle)
dfall <- if (length(articles) > 0) do.call(rbind, articles) else data.frame()
dfall