GET
# Method 1: let rvest's read_html() download the page and parse it in one call.
library(rvest)
## Warning: package 'rvest' was built under R version 3.4.2
## Loading required package: xml2
newsurl <- 'https://tw.appledaily.com/new/realtime'
apple <- read_html(newsurl)
# Uncomment to dump the whole parsed document as a single HTML string.
#as.character(apple)
# Method 2: send the HTTP GET explicitly with httr, then let content() parse
# the response body. Note that this reassigns `apple` to the GET() result.
library(httr)
apple <- GET(newsurl)
content(apple)
## {xml_document}
## <html lang="zh-TW">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body id="article" class="all">\n <div class="wrapper">\n ...
POST
# Submit the THSR timetable search as a form-encoded HTTP POST.
library(httr)
url <- "https://www.thsrc.com.tw/tw/TimeTable/SearchResult"
# Form fields sent in the request body; the station values are opaque ids.
payload <- list(
  StartStation = "977abb69-413a-4ccf-a109-0272c24fd490",
  EndStation   = "a7a04c89-900b-4798-95a3-c01c455622f4",
  SearchDate   = "2017/11/20",
  SearchTime   = "11:00",
  SearchWay    = "DepartureInMandarin"
)
res <- POST(url, body = payload, encode = "form")
# Show the parsed response body.
content(res)
## {xml_document}
## <html lang="zh-tw">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body class="revision_cn">\r\n\t\r\n\t\r\n<header><div class="revisi ...
## [3] <script src="/Content/Script/moveScroll.js" type="text/javaScript">< ...
## [4] <script src="/Content/Script/init_index.js" type="text/javaScript">< ...
## [5] <script src="/Content/Script/jquery.ellipsis.js" type="text/javaScri ...
## [6] <script src="/Content/Script/script-2.js" type="text/javaScript"></s ...
## [7] <script type="text/javascript">\r\n\r\n var _gaq = _gaq || [];\r\ ...
## [8] <script src="/Content/Script/revision.js" type="text/javaScript"></s ...
#as.character(content(res))
台鐵簡易查詢
# Taiwan Railways simple timetable query: every search criterion (searchdate,
# fromstation, tostation, trainclass, fromtime, totime) travels in the query
# string, so a plain GET through read_html() is enough. The %27 sequences in
# trainclass are URL-encoded single quotes around each class code.
res <- read_html('http://twtraffic.tra.gov.tw/twrail/SearchResult.aspx?searchtype=0&searchdate=2017/11/20&fromstation=1810&tostation=1008&trainclass=%271100%27,%271101%27,%271102%27,%271107%27,%271110%27,%271120%27&fromtime=0600&totime=2359')
PCHOME
#res <- read_html('http://ecapi.pchome.com.tw/ecshop/prodapi/v2/prod/DCAKAD-A90061LV6-000&store=DCAKAD&fields=Seq,Id,Name,Nick,Store,PreOrdDate,SpeOrdDate,Price,Discount,Pic,Weight,ISBN,Qty,Bonus,isBig,isSpec,isCombine,isDiy,isRecyclable,isCarrier,isMedical,isBigCart,isSnapUp,isDescAndIntroSync,isFoodContents,isHuge,isEnergySubsidy&_callback=jsonp_prod&1511147880?_callback=jsonp_prod')
#as.character(res)
# Fetch one PChome product record. The `fields` parameter lists the attributes
# requested and `_callback=jsonp_prod` names the JSONP wrapper function.
# NOTE(review): the endpoint answers with JSONP text, not HTML. read_html()
# wraps it in <html><body><p> and re-nests the <font>/<b> tags embedded in the
# "Nick" value (their closing tags end up after the payload in the printed
# output), so the string would need cleaning before it can be parsed as JSON.
res <- read_html('http://ecapi.pchome.com.tw/ecshop/prodapi/v2/prod/DGBJBH-A90083AYO-000&store=DGBJBY&fields=Seq,Id,Name,Nick,Store,PreOrdDate,SpeOrdDate,Price,Discount,Pic,Weight,ISBN,Qty,Bonus,isBig,isSpec,isCombine,isDiy,isRecyclable,isCarrier,isMedical,isBigCart,isSnapUp,isDescAndIntroSync,isFoodContents,isHuge,isEnergySubsidy&_callback=jsonp_prod&1511156520?_callback=jsonp_prod')
# Dump the parsed document as a string to inspect what came back.
as.character(res)
## [1] "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html><body><p>try{jsonp_prod({\"DGBJBH-A90083AYO-000\":{\"Seq\":17907797,\"Id\":\"DGBJBH-A90083AYO-000\",\"Name\":\"Nintendo Switch\\u300a\\u85a9\\u723e\\u9054\\u50b3\\u8aaa\\uff1a\\u8352\\u91ce\\u4e4b\\u606f The Legend of Zelda: Breath of the Wild\\u300b\\u82f1\\u65e5\\u591a\\u570b\\u8a9e\\u6587\\u7f8e\\u7248\",\"Nick\":\"<font color=\"RED\"><b>\\u767c\\u552e\\u65e5\\ufe31<font color=\"BLUE\"><b>2017-03-03 <br>Nintendo Switch\\u300a\\u85a9\\u723e\\u9054\\u50b3\\u8aaa\\uff1a\\u8352\\u91ce\\u4e4b\\u606f The Legend of Zelda: Breath of the Wild\\u300b\\u82f1\\u65e5\\u591a\\u570b\\u8a9e\\u6587\\u7f8e\\u7248\",\"Store\":\"DGBJBH\",\"PreOrdDate\":\"\",\"SpeOrdDate\":\"\",\"Price\":{\"M\":0,\"P\":2190},\"Discount\":0,\"Pic\":{\"B\":\"\\/items\\/DGBJBHA90083AYO\\/000001_1493886375.jpg\",\"S\":\"\\/items\\/DGBJBHA90083AYO\\/000002_1493886375.jpg\"},\"Weight\":0.06,\"ISBN\":\"\",\"Qty\":0,\"Bonus\":0,\"isBig\":0,\"isSpec\":0,\"isCombine\":0,\"isDiy\":0,\"isRecyclable\":0,\"isCarrier\":0,\"isMedical\":0,\"isBigCart\":1,\"isSnapUp\":0,\"isDescAndIntroSync\":0,\"isFoodContents\":0,\"isHuge\":0,\"isEnergySubsidy\":0}});}catch(e){if(window.console){console.log(e);}}</b></font></b></font></p></body></html>\n"
591
# Query the 591 for-sale listing search endpoint.
# The query string must spell out `&regionid=` and `&timestamp=` literally: an
# HTML-entity decoder had turned `&reg` into the registered-trademark sign and
# `&times` into the multiplication sign, which destroyed both parameter names.
res <- read_html('https://sale.591.com.tw/home/search/list?type=2&&regionid=1&timestamp=1511148080360')
# Uncomment to dump the parsed response as a single string.
#as.character(res)
Magrittr Sample
library(magrittr)
# Nested-call form: read inside-out. Take the first rows of iris with head(),
# keep the last 3 of those, pull the Sepal.Length column, then sum it.
sum(tail(head(iris) , 3)$Sepal.Length)
## [1] 15
# The same computation as a left-to-right pipeline. `%>%` feeds the left-hand
# value into the next call, and `.` refers to that piped value explicitly.
iris %>% head() %>% tail(3) %>% .$Sepal.Length %>% sum()
## [1] 15
Rvest Parsing Example
# Minimal HTML document used to demonstrate CSS selectors with rvest:
# one <h1> carrying an id and two <a> elements sharing a class.
html_sample <- '
<html>
<body>
<h1 id="title">Hello World</h1>
<a href="#" class="link">This is link1</a>
<a href="# link2" class="link">This is link2</a>
</body>
</html>
'
# Select by tag name; html_node() returns the first match as a node.
read_html(html_sample) %>% html_node('h1')
## {xml_node}
## <h1 id="title">
# html_text() extracts the text inside the selected node.
read_html(html_sample) %>% html_node('h1') %>% html_text()
## [1] "Hello World"
# Select by id: prefix the id with '#'.
read_html(html_sample) %>% html_node('#title') %>% html_text()
## [1] "Hello World"
# Tag and id combined.
read_html(html_sample) %>% html_node('h1#title') %>% html_text()
## [1] "Hello World"
# A space means "descendant of": the h1#title somewhere inside <body>.
read_html(html_sample) %>% html_node('body h1#title') %>% html_text()
## [1] "Hello World"
read_html(html_sample) %>% html_node('body #title') %>% html_text()
## [1] "Hello World"
# html_node() keeps only the first <a> ...
read_html(html_sample) %>% html_node('a') %>% html_text()
## [1] "This is link1"
# ... whereas html_nodes() returns every match.
read_html(html_sample) %>% html_nodes('a') %>% html_text()
## [1] "This is link1" "This is link2"
# Select by class: prefix the class name with '.'.
read_html(html_sample) %>% html_nodes('.link') %>% html_text()
## [1] "This is link1" "This is link2"
# Tag and class combined.
read_html(html_sample) %>% html_nodes('a.link') %>% html_text()
## [1] "This is link1" "This is link2"
read_html(html_sample) %>% html_nodes('body a.link') %>% html_text()
## [1] "This is link1" "This is link2"
# html_attr() reads an attribute value (here href) instead of the node text.
read_html(html_sample) %>% html_nodes('body .link') %>% html_attr('href')
## [1] "#" "# link2"
蘋果新聞爬蟲範例
# Download the Apple Daily realtime-news index and pick out every <a> element
# that sits under an element with class "rtddt".
library(rvest)
newsurl <- "https://tw.appledaily.com/new/realtime/"
apple <- read_html(newsurl)
rtddt <- html_nodes(apple, ".rtddt a")
# Print the node set to see how many links were matched.
rtddt
## {xml_nodeset (30)}
## [1] <a href="https://tw.news.appledaily.com/life/realtime/20171120/1244 ...
## [2] <a href="https://tw.news.appledaily.com/life/realtime/20171120/1244 ...
## [3] <a href="https://tw.news.appledaily.com/local/realtime/20171120/124 ...
## [4] <a href="https://tw.finance.appledaily.com/realtime/20171120/124457 ...
## [5] <a href="https://tw.news.appledaily.com/local/realtime/20171120/124 ...
## [6] <a href="https://tw.entertainment.appledaily.com/realtime/20171120/ ...
## [7] <a href="https://tw.news.appledaily.com/local/realtime/20171120/124 ...
## [8] <a href="https://tw.news.appledaily.com/international/realtime/2017 ...
## [9] <a href="https://tw.news.appledaily.com/life/realtime/20171120/1244 ...
## [10] <a href="https://tw.finance.appledaily.com/realtime/20171120/124457 ...
## [11] <a href="https://tw.news.appledaily.com/politics/realtime/20171120/ ...
## [12] <a href="https://tw.news.appledaily.com/local/realtime/20171120/124 ...
## [13] <a href="https://tw.finance.appledaily.com/realtime/20171120/124456 ...
## [14] <a href="https://tw.news.appledaily.com/nextmag/realtime/20171120/1 ...
## [15] <a href="https://tw.entertainment.appledaily.com/realtime/20171120/ ...
## [16] <a href="https://tw.finance.appledaily.com/realtime/20171120/124456 ...
## [17] <a href="https://tw.lifestyle.appledaily.com/lifestyle/realtime/201 ...
## [18] <a href="https://tw.news.appledaily.com/nextmag/realtime/20171120/1 ...
## [19] <a href="https://tw.news.appledaily.com/international/realtime/2017 ...
## [20] <a href="https://tw.news.appledaily.com/forum/realtime/20171120/124 ...
## ...
# Dump the raw HTML of the first matched link to see which child tags it holds
# (the printed result shows <time>, <h2> and <h1> inside the <a>).
as.character(rtddt[1])
## [1] "<a href=\"https://tw.news.appledaily.com/life/realtime/20171120/1244579/\" target=\"_blank\">\n <time>16:15</time><h2>生活</h2>\n <h1><font color=\"#383c40\">響應聯合國兒童權利公約 新北共融遊戲場誕...</font></h1>\n </a>"
# Pull one vector per field out of the link nodes, then assemble them into a
# data frame with one row per headline.
title    <- html_text(html_nodes(rtddt, "h1"))
category <- html_text(html_nodes(rtddt, "h2"))
dt       <- html_text(html_nodes(rtddt, "time"))
# The article address is the href attribute of each <a> node itself.
link     <- html_attr(rtddt, "href")
applenews <- data.frame(
  title = title,
  category = category,
  time = dt,
  url = link,
  stringsAsFactors = FALSE
)
#applenews
抓 PTT
# Download the PTT Creditcard board index; each post entry on the page is an
# element with class "r-ent".
ptt <- read_html("https://www.ptt.cc/bbs/Creditcard/index.html")
ent <- html_nodes(ptt, ".r-ent")
# Dump the first entry's raw HTML to see its structure.
as.character(ent[1])
## [1] "<div class=\"r-ent\">\n\t\t\t<div class=\"nrec\"><span class=\"hl f2\">3</span></div>\n\t\t\t<div class=\"mark\"></div>\n\t\t\t<div class=\"title\">\n\t\t\t\n\t\t\t\t<a href=\"/bbs/creditcard/M.1511157944.A.011.html\">[問題] 中信的聯徵</a>\n\t\t\t\n\t\t\t</div>\n\t\t\t<div class=\"meta\">\n\t\t\t\t<div class=\"date\">11/20</div>\n\t\t\t\t<div class=\"author\">HyperactiveX</div>\n\t\t\t</div>\n\t\t</div>"
# Extract each field from the post entries; trimws() strips the surrounding
# whitespace that the page markup leaves around the text.
title  <- trimws(html_text(html_nodes(ent, ".title")))
nrec   <- trimws(html_text(html_nodes(ent, ".nrec")))
author <- trimws(html_text(html_nodes(ent, ".author")))
date   <- trimws(html_text(html_nodes(ent, ".date")))
# One row per post.
df <- data.frame(
  title = title,
  nrec = nrec,
  author = author,
  date = date,
  stringsAsFactors = FALSE
)
#df
抓取內容頁
# Scrape a single Apple Daily article page: headline, first body paragraph,
# creation timestamp and view counter, each taken from the first matching node.
res <- read_html("https://tw.news.appledaily.com/local/realtime/20171120/1244445/")
title    <- html_text(html_node(res, "h1"))
article  <- html_text(html_node(res, ".ndArticle_margin p"))
dt       <- html_text(html_node(res, ".ndArticle_creat"))
view_cnt <- html_text(html_node(res, ".ndArticle_view"))
newsdf <- data.frame(
  title = title,
  article = article,
  dt = dt,
  view_cnt = view_cnt,
  stringsAsFactors = FALSE
)
newsdf
## title
## 1 問卦實況主腳踏兩條船? 20網友留言挨告
## article
## 1 有網友在巴哈姆特電玩資訊站詢問網友,是否有《神魔之塔》邱姓實況主「平民百姓」用真實身分騙20歲小女友的八卦?許多網友紛紛在下面留言,指他腳踏兩條船。邱男認為名譽受辱,對留言的20名網友提告誹謗,但台北地檢署認為實況主為公眾人物,網友對可受公評之事留言批評不違法,今將其中17名成年網友不起訴,另3名未成年網友則移請少年法庭審理。另外,邱男還起違反《著作權法》告訴,指控網友將他玩遊戲進行遊戲攻略時的畫面截圖,還加註意見成為簽名檔散布,不過檢察官認為網友做此事並未營利、屬合理使用範圍,並無違反《著作權法》,今一併不起訴處分。據了解,暱稱「平民百姓」的邱男,為「神魔之塔攻略網站G8GAME」的創站成員之一。2015年底,暱稱「牛肉騰」的網友,使用邱男玩遊戲的截圖,加註自己的意見,說他提供的關卡資訊從不更新,成為簽名檔,張貼在巴哈姆特網站,邱男見到到特地發文澄清,並無從不更新的情形。沒想到牛肉騰事件在網路發酵,有網友在巴哈姆特網站、PTT,在巴哈姆特自己的小屋詢問有沒有「平民百姓」用真實身分騙20歲女友的八卦?結果引發許多網友留言指責他腳踏兩條船,甚至連邱男的女友也加入戰局,上網留言。邱男認為這些都是不實的言論,已經損及他的名譽,去年曾透過律師發表聲明,指稱「諸多網友不明究理,於自己的巴哈姆特小屋上散布不實謠言妨害本人名譽,甚至散布地點擴及G8GAME網站、本人個人粉絲頁、YOUTUBE頻道、均有網友前來張貼損害本人名譽之文字,顯見已有特定人士刻意幕後操弄網友,對本人及G8GAME進行惡意詆毀及集體霸凌之行徑。」邱男也表示,他已完成相關事證的蒐集,並委任律師處理,同時也「敬告所有曾經張貼、轉貼任何涉及本人、損及本人名譽、詆毀G8GAME網站文字的網友,自即日起立即停止相關行為,並自本聲明刊登之日起五日內,主動聯繫本人道歉,並協商和解事宜。」只不過台北地檢署調查後,認為截圖為合理使用,網友對可受公評之事留言批評也不違法,今將其中17名成年網友不起訴,另3名未成年網友則移請少年法庭審理。(呂志明/台北報導)【更多新聞,請看《蘋果陪審團》粉絲團】
## dt view_cnt
## 1 建立時間:2017/11/20 13:48 1703
# Scrape one Apple Daily article page into a one-row data frame.
#
# url: address of the article page to download.
# Returns a data.frame with the character columns title, article, dt,
# category and clicked, each holding the text of the first node that matches
# the corresponding CSS selector.
getArticle <- function(url) {
  page <- read_html(url)
  # Text of the first node on this page matching a CSS selector.
  first_text <- function(css) {
    html_text(html_node(page, css))
  }
  data.frame(
    title = first_text("h1"),
    article = first_text(".ndArticle_margin p"),
    dt = first_text(".ndArticle_creat"),
    category = first_text(".ndgTag .current"),
    clicked = first_text(".ndArticle_view"),
    stringsAsFactors = FALSE
  )
}
# Try getArticle() on two different article pages; each call yields a
# one-row data frame with identical columns ...
df1 <- getArticle('https://tw.news.appledaily.com/local/realtime/20171120/1244445/')
df2 <- getArticle('https://tw.finance.appledaily.com/realtime/20171120/1244452/')
# ... so the results can be stacked row-wise with rbind().
rbind(df1,df2)
## title
## 1 問卦實況主腳踏兩條船? 20網友留言挨告
## 2 <U+200B>7-ELEVEN熱食區年賣2.4億個 營業額上看30億
## article
## 1 有網友在巴哈姆特電玩資訊站詢問網友,是否有《神魔之塔》邱姓實況主「平民百姓」用真實身分騙20歲小女友的八卦?許多網友紛紛在下面留言,指他腳踏兩條船。邱男認為名譽受辱,對留言的20名網友提告誹謗,但台北地檢署認為實況主為公眾人物,網友對可受公評之事留言批評不違法,今將其中17名成年網友不起訴,另3名未成年網友則移請少年法庭審理。另外,邱男還起違反《著作權法》告訴,指控網友將他玩遊戲進行遊戲攻略時的畫面截圖,還加註意見成為簽名檔散布,不過檢察官認為網友做此事並未營利、屬合理使用範圍,並無違反《著作權法》,今一併不起訴處分。據了解,暱稱「平民百姓」的邱男,為「神魔之塔攻略網站G8GAME」的創站成員之一。2015年底,暱稱「牛肉騰」的網友,使用邱男玩遊戲的截圖,加註自己的意見,說他提供的關卡資訊從不更新,成為簽名檔,張貼在巴哈姆特網站,邱男見到到特地發文澄清,並無從不更新的情形。沒想到牛肉騰事件在網路發酵,有網友在巴哈姆特網站、PTT,在巴哈姆特自己的小屋詢問有沒有「平民百姓」用真實身分騙20歲女友的八卦?結果引發許多網友留言指責他腳踏兩條船,甚至連邱男的女友也加入戰局,上網留言。邱男認為這些都是不實的言論,已經損及他的名譽,去年曾透過律師發表聲明,指稱「諸多網友不明究理,於自己的巴哈姆特小屋上散布不實謠言妨害本人名譽,甚至散布地點擴及G8GAME網站、本人個人粉絲頁、YOUTUBE頻道、均有網友前來張貼損害本人名譽之文字,顯見已有特定人士刻意幕後操弄網友,對本人及G8GAME進行惡意詆毀及集體霸凌之行徑。」邱男也表示,他已完成相關事證的蒐集,並委任律師處理,同時也「敬告所有曾經張貼、轉貼任何涉及本人、損及本人名譽、詆毀G8GAME網站文字的網友,自即日起立即停止相關行為,並自本聲明刊登之日起五日內,主動聯繫本人道歉,並協商和解事宜。」只不過台北地檢署調查後,認為截圖為合理使用,網友對可受公評之事留言批評也不違法,今將其中17名成年網友不起訴,另3名未成年網友則移請少年法庭審理。(呂志明/台北報導)【更多新聞,請看《蘋果陪審團》粉絲團】
## 2 7-ELEVEN今天宣告推出「蒸美味關東煮」系列,斥資上千萬開發新設備,新品包括蒸的花生米血糕、蒸糯米腸、蒸台灣黑輪等,預估年底導入500間店,明年目標為2000店。統一表示,蒸美味熱食自助區的「七大天王」一年賣達2.4億個,今年營業額上看30億元。7-ELEVEN表示,現有門市的蒸美味熱食自助區,涵蓋茶葉蛋、大燒包、熱狗大亨、關東煮、蒸地瓜、蒸茶碗蒸及蒸黃金玉米等七大結構,號稱「七大天王」,受到不同族群喜愛,如蒸地瓜主攻女性上班族,多購買當作早餐及排毒餐,一年熱銷300萬條。默默熱銷20年的茶碗蒸則以醫院型、交通轉運站商圈最受歡迎,預估今年底更可賣出超過230萬個,約可疊成307座101大樓。「七大天王」一年賣量高達2.4億個,等於平均每位國人每年到7-ELEVEN蒸美味熱食自助區購買10次以上。7-ELEVEN指出,「蒸美味關東煮」系列,除原有湯鍋式關東煮(原味及麻辣兩種)外,開發蒸的花生米血糕、蒸糯米腸、蒸台灣黑輪…等台式小吃,及花生粉、辣味噌醬、日式黃芥末等沾醬配料,打破國人對關東煮的刻板印象,後續也將開發更多新品,像是蒸蘿蔔糕、蒸燒賣、蒸芋粿巧及蒸肉圓…等台港小點。另外,全新的7-ELEVEN「蒸美味關東煮」,在設備上,研發出「恆溫循環式蒸氣專用設備」,有別於湯鍋以蔬菜、豆類關東煮為主,蒸鍋則主打肉類、魚漿類,以86℃恆溫循環式蒸氣來保持關東煮的鮮甜口感。(彭蕙珍/台北報導) <U+00A0> <U+00A0>
## dt category clicked
## 1 建立時間:2017/11/20 13:48 社會 1703
## 2 建立時間:2017/11/20 13:32 財經地產 1073
完成蘋果新聞爬蟲
# Download an Apple Daily article and return its fields as a one-row data frame.
#
# url: address of the article page.
# Returns a data.frame whose columns (title, article, dt, category, clicked)
# are character values taken from the first node matching each selector below.
getArticle <- function(url) {
  doc <- read_html(url)
  # Output column name -> CSS selector locating that field on the page.
  selectors <- c(
    title = "h1",
    article = ".ndArticle_margin p",
    dt = ".ndArticle_creat",
    category = ".ndgTag .current",
    clicked = ".ndArticle_view"
  )
  fields <- lapply(selectors, function(css) html_text(html_node(doc, css)))
  data.frame(fields, stringsAsFactors = FALSE)
}
# Crawl listing pages 1-3 of the Apple Daily realtime feed, scrape every linked
# article with getArticle(), and save the combined table to applenews.csv.
newsurl <- "https://tw.appledaily.com/new/realtime/"
articles <- list()
for (page in 1:3) {
  apple <- read_html(paste0(newsurl, page))
  rtddt <- apple %>% html_nodes(".rtddt a")
  # html_attr() returns one href per node, so no per-node loop is needed.
  urls <- rtddt %>% html_attr("href")
  # Keep the one-row data frames in a list: calling rbind() on the growing
  # result inside the loop would re-copy every earlier row on each iteration.
  articles <- c(articles, lapply(urls, getArticle))
}
# Bind once at the end. When no link matched, fall back to the empty data
# frame that the accumulator used to start from.
dfall <- if (length(articles) > 0) do.call(rbind, articles) else data.frame()
write.csv(x = dfall, file = "applenews.csv")
#dfall
抓取範例
# Scrape every article linked from the first page of the realtime feed.
apple <- read_html("https://tw.appledaily.com/new/realtime/")
rtddt <- apple %>% html_nodes(".rtddt a")
# html_attr() is applied to the whole node set, giving all hrefs at once.
urls <- rtddt %>% html_attr("href")
# Scrape each article into a one-row data frame and bind them in a single
# step instead of growing the result with rbind() on every iteration.
articles <- lapply(urls, getArticle)
# With no matching links the result is an empty data frame, as before.
dfall <- if (length(articles) > 0) do.call(rbind, articles) else data.frame()
dfall
dfall