PTT 爬蟲
library(rvest)
## Warning: package 'rvest' was built under R version 3.2.5
## Loading required package: xml2
ptturl = 'https://www.ptt.cc/bbs/Food/index.html'
rent = read_html(ptturl) %>% html_nodes('.r-ent')
titles = rent %>% html_nodes('.title') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
authors = rent %>% html_nodes('.author') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
dts = rent %>% html_nodes('.date') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
pttdf = data.frame(title = titles, author = authors, dt = dts)
View(pttdf)
酷航爬蟲
library(httr)
## Warning: package 'httr' was built under R version 3.2.5
payload = list(
`__EVENTTARGET`='AvailabilitySearchInputSearchView$LinkButtonSubmit',
`availabilitySearch.SearchInfo.SearchStations[0].DepartureStationCode`='TPE',
`availabilitySearch.SearchInfo.SearchStations[0].ArrivalStationCode`='NRT',
`availabilitySearch.SearchInfo.SearchStations[0].DepartureDate`='12/16/2016',
`availabilitySearch.SearchInfo.SearchStations[1].DepartureStationCode`='NRT',
`availabilitySearch.SearchInfo.SearchStations[1].ArrivalStationCode`='TPE',
`availabilitySearch.SearchInfo.SearchStations[1].DepartureDate`='12/30/2016',
`availabilitySearch.SearchInfo.Direction`='Return',
`fromDate`='1467993600000',
`returnDate`='1468512000000',
`availabilitySearch.SearchInfo.AdultCount`='5',
`availabilitySearch.SearchInfo.ChildrenCount`='0',
`availabilitySearch.SearchInfo.InfantCount`='0'
)
flySelect = 'http://makeabooking.flyscoot.com/?culture=zh-TW'
res = POST(flySelect, body = payload, encode = 'form' , encoding='UTF-8')
#res
flyInfo = 'http://makeabooking.flyscoot.com/Flight/Select'
res2 = GET(flyInfo)
flightinfo = content(res2) %>% html_nodes('.flight_select')
flightprice = flightinfo %>% html_nodes('.flight_price') %>% html_text()
flightdate = flightinfo %>% html_nodes('.flightdate') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
flightdf = data.frame(price = flightprice, dt = flightdate )
View(flightdf)
蘋果新聞列表抓取
library(rvest)
applenews = 'http://www.appledaily.com.tw/realtimenews/section/new/'
rtddt = read_html(applenews) %>% html_nodes('.rtddt a')
titles = rtddt %>% html_nodes('h1') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
categorys = rtddt %>% html_nodes('h2') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
times = rtddt %>% html_nodes('time') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
links = rtddt %>% html_attr('href') %>% iconv(from='UTF-8', to='UTF-8')
links = paste0('http://www.appledaily.com.tw', links)
applelist = data.frame(title = titles, category = categorys, time = times, link = links)
內容頁抓取
appleurl = 'http://www.appledaily.com.tw/realtimenews/article/entertainment/20160710/904877/伊能靜生女照被當廣告 坐月子還要忙告人'
read_html(appleurl) %>% html_nodes('.trans') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
## [1] "\n 伊能靜上月在美國生下女兒「小米粒」,還在坐月子卻不得閒,和老公秦昊跨海委託律師發聲明,捍衛肖像權。事件起因於她的主治醫生未經她同意,在網路公開他們的合照,結果被其他商家拿來當作赴美生子服務、月子服務、產婦食品等廣告。伊能靜工作室po出律師聲明,並表示:「醫師若要曝光患者,應經患者同意授權。保護患者隱私及保護未成年孩子曝光,乃醫師基本醫德。如今,特意告知私下紀念切勿公開外流的紀念照成了廣告,已超出應有醫德之範圍。經溝通,謝謝醫師願意道歉。請各界勿擅用未經授權的照片做商業使用,以免觸法。」而伊能靜解決心頭大患後,昨晚照例在微博po出母女照,表示當地時間是早上8點,她剛睡醒,準備餵女兒喝奶,但網友都關心小米粒「歪過頭」,怕寶寶脖子不舒服但寶寶不能說。(吳惠菁/綜合報導)\n 【看了這則新聞的人,也看了……】好大牌!半夜奔波225公里買披薩 天后卻不吃了仔仔做人超車F3 年輕阿公不是夢人妻女星修修臉上癮 明明變個人還裝傻伊能靜昨委託律師發聲明。翻攝伊能靜微博伊能靜生女後和主治醫生合照,被對方po上網路,因此發律師函捍衛肖像權。翻攝伊能靜工作室微博 \n\t\t\t\t\tgoogletag.cmd.push(function() {googletag.display(\"InRead\");})\n googletag.cmd.push(function() {googletag.display('teadstv');})\n googletag.cmd.push(function() {googletag.display('goldenhorse');})\n googletag.cmd.push(function() {googletag.display('textlink');})\n 有話要說 投稿「即時論壇」\n "
連結抓取範例
# attr 可以抓取連結屬性
read_html('<a href="largitdata.com" qoo=123> hihi</a>') %>% html_nodes('a') %>% html_attr('href')
## [1] "largitdata.com"
read_html('<a href="largitdata.com" qoo=123> hihi</a>') %>% html_nodes('a') %>% html_attr('qoo')
## [1] "123"
paste0
paste('hello', 'world')
## [1] "hello world"
paste0('hello', 'world')
## [1] "helloworld"
paste('hello', 'world', sep='@')
## [1] "hello@world"
# paste with sep='' equals to paste0
paste('hello', 'world', sep='')
## [1] "helloworld"
剖析內容頁
appleurl = 'http://www.appledaily.com.tw/realtimenews/article/entertainment/20160710/904877/伊能靜生女照被當廣告 坐月子還要忙告人'
h1 = read_html(appleurl) %>% html_nodes('#h1') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
trans = read_html(appleurl) %>% html_nodes('.trans') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
clicked = read_html(appleurl) %>% html_nodes('.clicked') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
gggstime = read_html(appleurl) %>% html_nodes('.gggs time') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
df = data.frame(title = h1, article = trans, click = clicked, dt = gggstime)
函式 (Function)
ftoc = function(F_degree){
((F_degree - 32) * 5) / 9 #Return
}
ftoc(62)
## [1] 16.66667
newscore = function(score){
s1 = sqrt(score)
s2 = s1 * 10
s2 #Return
}
newscore(60)
## [1] 77.45967
addnum = function(a, b = 10){
a + b
}
addnum(3,5)
## [1] 8
addnum(3)
## [1] 13
addnum2 = function(a = 5, b = 10){
a + b
}
addnum2(3)
## [1] 13
addnum2(b = 3)
## [1] 8
建立內文元素剖析函式
getArticle = function(appleurl, category){
h1 = read_html(appleurl) %>% html_nodes('#h1') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
trans = read_html(appleurl) %>% html_nodes('.trans') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
clicked = read_html(appleurl) %>% html_nodes('.clicked') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
gggstime = read_html(appleurl) %>% html_nodes('.gggs time') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
df = data.frame(title = h1, article = trans, click = clicked, dt = gggstime, category = category)
df
}
news = getArticle('http://www.appledaily.com.tw/realtimenews/article/life/20160710/904894/%E6%95%91%E6%95%91%E5%8F%B0%E6%9D%B1%E4%BA%BA%EF%BC%81%E5%8F%B0%E6%9D%B1%E9%A6%AC%E5%81%95%E5%89%B5%E9%99%A229%E5%B9%B4%E4%BE%86%E6%9C%80%E5%9A%B4%E9%87%8D%E7%81%BD%E6%90%8D', 'qpp')
View(news)
FOR 迴圈
for(i in 1:10){
print(paste("hello", i))
}
## [1] "hello 1"
## [1] "hello 2"
## [1] "hello 3"
## [1] "hello 4"
## [1] "hello 5"
## [1] "hello 6"
## [1] "hello 7"
## [1] "hello 8"
## [1] "hello 9"
## [1] "hello 10"
取得各篇文章細節
dfall = data.frame(title = character(),
article= character(),
click = character(),
dt =character(),
category= character())
getURL <- function(applenews){
#applenews = 'http://www.appledaily.com.tw/realtimenews/section/new/'
rtddt = read_html(applenews) %>% html_nodes('.rtddt a')
for (ele in rtddt){
categorys = ele %>% html_nodes('h2') %>%
html_text() %>% iconv(from='UTF-8', to='UTF-8')
links = ele %>% html_attr('href') %>%
iconv(from='UTF-8', to='UTF-8')
links = paste0('http://www.appledaily.com.tw', links)
df = getArticle(links, categorys)
dfall <- rbind(dfall,df)
}
dfall
}
applenews = getURL('http://www.appledaily.com.tw/realtimenews/section/new/')
View(applenews)
list 資料結構
# 建立一個list (with name)
person = list(name = 'Qoo', age = 30, likes=c('coke', 'cola', 'qoo'))
person$name # get value by name
## [1] "Qoo"
# 建立一個list (without name)
a = list(c(1,2,3), c(1,2))
a[[1]] # get value by position
## [1] 1 2 3
a[[2]]
## [1] 1 2
清理文章資料
a = 'aaa@bbb@ccc'
b = strsplit(a, '@')
b[[1]][1]
## [1] "aaa"
s = c('aaa@bbb@ccc', 'ddd@eee')
s1 = strsplit(s, '@')
s1
## [[1]]
## [1] "aaa" "bbb" "ccc"
##
## [[2]]
## [1] "ddd" "eee"
class(s1)
## [1] "list"
s1[[1]]
## [1] "aaa" "bbb" "ccc"
s1[[2]]
## [1] "ddd" "eee"
character v.s. factor
class(c(1,2,3))
## [1] "numeric"
class(c('1','2','3'))
## [1] "character"
as.factor(c(1,2,3,1,2,3,1,2))
## [1] 1 2 3 1 2 3 1 2
## Levels: 1 2 3
# plot as numeric
plot(c(1,2,3,1,2,3,1,2))

# plot as factor
plot(as.factor(c(1,2,3,1,2,3,1,2)))

trimws
a <- ' Hi This is a sample '
a
## [1] " Hi This is a sample "
trimws(a)
## [1] "Hi This is a sample"
trimws(a, 'l')
## [1] "Hi This is a sample "
trimws(a, 'r')
## [1] " Hi This is a sample"
collapse string
a = 'aaa@bbb@ccc'
b = strsplit(a, '@')
b[[1]][1]
## [1] "aaa"
paste(b[[1]], collapse = ' ')
## [1] "aaa bbb ccc"
paste(b[[1]], collapse = '|')
## [1] "aaa|bbb|ccc"
清理文章資料
class(applenews[ 1 , "article" ])
## [1] "factor"
# 將factor 轉型回character
applenews$article = as.character(applenews$article)
# clean up the article
a = strsplit(applenews[ 1 , "article" ], 'googletag')
b = a[[1]][1]
c = trimws(b)
d = strsplit(c, ' +')
e = paste(d, collapse=' ')
# write in magrittr %>%
strsplit(applenews[ 1 , "article" ], 'googletag') %>% .[[1]] %>% .[1] %>% trimws() %>% strsplit(., ' +') %>% .[[1]] %>% paste(., collapse = ' ')
## [1] "更新時間:16:33【更新:加賽結果】今天被迫與中國隊加賽的台灣代表隊,在加賽中火力完全釋放,終場以11比1大勝中國隊,台灣明天上午11點,繼續與菲律賓爭奪冠軍,以及LLB亞太區青棒代表權。【更新:新增影片】在中國參加2016年LLB(Little League Baseball,世界少棒聯盟)亞太區錦標賽的台灣代表隊原訂今天下午要與菲律賓隊打冠軍戰,但昨天台菲戰遭中國隊申訴,違反運動家精神 ,除教練團要被禁賽外,今天還需與中國隊再打一場,勝者才能與菲律賓打冠軍賽,這起事件引起許多網友討論(LLB台灣隊違反運動家精神 網友反應不一)。國際運動場上「策略運用」選對手,或「假球」的事件屢見不鮮,《蘋果》整理以下案例:【倫敦奧運】2012年倫敦奧運羽球小組循環賽,當時中國女雙組合、世界排名第1的王曉理、于洋刻意輸給南韓的「金銀組合」鄭景銀、金河娜,就是為了4強賽時能避開自家人趙雲蕾、田卿。南韓不甘願贏了比賽,另一組女雙河貞恩與金旼貞在面對印尼的比賽中,也刻意想要輸,因為一旦贏了比賽,8強淘汰賽首役,河貞恩與金旼貞就要硬碰世界第1的王曉理、于洋。最後世界羽球總會(BWF)出面滅火,將8名選手全處以失格極刑。倫敦奧運羽球打假球 共8人遭判失格【玉山盃青棒賽】2008年玉山盃青棒賽,是台灣首次因「違反運動精神」宣布比賽重賽的棒球賽事。該年玉山盃,平鎮高中為主體的桃園縣與中道中學為主體的宜蘭縣,兩隊為求晉級而消極比賽,比賽中途遭大會裁定取消,隔日再賽。違反運動家精神 台灣棒壇有前例【LLS女壘比賽】去年LLS(Little League Softball,世界少壘聯盟)的女壘比賽,華盛頓州代表隊跟本次台灣隊情況相同,雖早已確定晉級準決賽,但在最後一場預賽中,若勝過對手北卡羅萊納隊3分以上,將會在準決賽對上實力較強的愛荷華代表隊,因此教練讓主力球員坐板凳,最後華盛頓以0比8敗給北卡,也讓實力較強的愛荷華遭淘汰。愛荷華代表隊立即提出抗議,大會隨即判定除北卡取得準決賽資格外,愛荷華與華盛頓必需要加賽一場,決定晉級隊伍,最後愛荷華在加賽中擊敗華盛頓,獲得晉級。違反運動精神加賽 去年少棒有先例【HBL高中籃球聯賽】HBL高中籃球聯賽也曾發生過策略性選擇對手的案件,2008年12月30日,男子組16強複賽第2輪青年高中與南湖高中的比賽,當時最後14.3秒兩隊46比46打平,青年掌握最後一波進攻權,教練卻下達戰術要球員故意失誤,最後青年以2分之差輸球,也把新榮高中淘汰出局。教練遭處禁賽一年。HBL也曾發生策略性選擇 教練遭禁賽一年10:40今天上午判定台灣隊違反運動精神,除了教練團遭禁賽外,下午1點半還需要先與中國隊打一場加賽,勝者才能爭冠。台灣隊目前由隨隊具有教練資格的吳御瑋負責比賽調度。中華棒協秘書長林宗成表示,目前已經收到世界少棒聯盟所寄來的通知,除了上述教練團停權、需加賽一場兩點外,現階段會如期把比賽打完,待球隊回國後,將立即召開技術委員會討論此重大事件。<U+00A0>7月9日台灣隊昨天賽前已經篤定可以打入冠軍戰,但中國隊因為吞3連敗,僅在昨天勝過菲律賓,因此想要爭冠必需要看台菲戰結果,而台灣隊在刻意保留實力下,派出辛元旭擔任投手對上菲律賓。儘管台灣隊全場敲出8支安打,卻不敵在3局下敲出2分砲的菲律賓隊,終場以2比4輸球,這也確定中國隊遭淘汰,今天將由台菲爭冠,不過中國隊立即提出申訴。(賴德剛/綜合報導)LLB亞太區青棒 台灣明與菲律賓爭冠軍發稿時間:10:40更新時間:14:44<U+00A0>\n 【想知道更多,一定要看……】<U+200B>HBL也曾發生策略性選擇 教練遭禁賽一年LLB台灣隊違反運動家精神 網友反應不一<U+200B>倫敦奧運羽球打假球 共8人遭判失格 【看了這則新聞的人,也看了……】違反運動精神加賽 去年有先例【更新】西武吳念庭一軍首安出爐 李振昌重返一軍【好大膽】NFL球員罵過界 竟砲柯瑞斂財 亞太區青棒台灣被判違反運動家精神。資料照片亞太區青棒台灣被判違反運動家精神。資料照片台菲戰先發投手辛元旭。資料照片"
gsub 範例
a <- 'ABC@123@DEF'
gsub('.*@(.*)@.*', '\\1', a)
## [1] "123"
gsub('.*@(.*)@(.*)', '\\2', a)
## [1] "DEF"
email <- 'qoo@oop.com'
gsub('(.+)@(.+)', '\\1', email)
## [1] "qoo"
gsub('(.+)@(.+)', '\\2', email)
## [1] "oop.com"
轉換人氣數
class(applenews$click)
## [1] "factor"
applenews$click = as.character(applenews$click)
gsub('.{2}\\((\\d+)\\)', '\\1',x = applenews[1,'click'])
## [1] "178059"
gsub('.{2}\\((\\d+)\\)', '\\1',x = applenews[,'click'])
## [1] "178059" "0" "2526" "0" "0" "9706" "63"
## [8] "45" "289" "1249" "561" "1037" "6133" "27502"
## [15] "1167" "983" "5679" "331" "105" "14590" "1220"
## [22] "1220" "895" "1743" "101" "21515" "340" "552"
## [29] "2230" "6360" "1097"
applenews$click =
as.integer(gsub('.{2}\\((\\d+)\\)', '\\1',x = applenews$click))
sort(applenews$click)
## [1] 0 0 0 45 63 101 105 289 331 340
## [11] 552 561 895 983 1037 1097 1167 1220 1220 1249
## [21] 1743 2230 2526 5679 6133 6360 9706 14590 21515 27502
## [31] 178059
head(applenews[order(applenews$click, decreasing = TRUE), c('title', 'click') ], 5)
## title click
## 1 【不斷更新】亞太區青棒台灣遭申訴仍大勝中國 明與菲爭冠 178059
## 14 【我難過】小S台東媳婦伴公公 捐百萬助鄉親 27502
## 26 【有片】開齋節慶祝活動 台北車站大廳湧入穆斯林 21515
## 20 【更新】沒事插花罵警靠北 2男GG了 14590
## 6 蘇嘉全弔唁飛彈誤射喪家 籲最高額國賠處理 9706
字串轉換為時間
ds <- c("18, 2014 12:00")
x1 <- strptime(ds, "%d, %Y %H:%M")
x1
## [1] "2014-07-18 12:00:00 CST"
ds = '2016年04月05日22:47'
x2 <- strptime(ds, "%Y年%m月%d日%H:%M")
x2
## [1] "2016-04-05 22:47:00 CST"
x2 - x1
## Time difference of 627.4493 days
時間轉換
applenews$dt <- strptime(applenews$dt, "%Y年%m月%d日%H:%M")
View(applenews)
*apply family
# for loop
s = 0
for(i in 1:10){
s = s + i
}
s
## [1] 55
# *apply
x = list( c(1,2,3) , c(2,3,4,5) )
lapply(x, sum)
## [[1]]
## [1] 6
##
## [[2]]
## [1] 14
sapply(x, sum)
## [1] 6 14
f = function(e){
e[1]
}
lapply(x, f)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2
sapply(x, f)
## [1] 1 2
lapply(x, function(e)e[1])
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2
sapply(x, function(e)e[1])
## [1] 1 2
清理文章
contentclean <- function(news){
news %>% strsplit("googletag") %>% .[[1]] %>% .[1] %>% trimws() %>% strsplit('[ \n\r]+') %>% unlist() %>% paste(collapse=' ')
}
applenews$article <- sapply(applenews$article, contentclean)
抓取分頁資訊
pageurl = 'http://www.appledaily.com.tw/realtimenews/section/new/'
# use for loop
for (i in 1:30){
print(paste0(pageurl, i) )
}
# use sapply
sapply(1:30, function(e) paste0(pageurl,e))
pageurl <- 'http://www.appledaily.com.tw/realtimenews/section/new/'
dfall <- lapply(1:3,
function(i){
url <- paste0(pageurl, i)
df <- getURL(url)
}
)
appledf <- do.call("rbind",dfall)
appledf$article = as.character(appledf$article)
appledf$dt = as.character(appledf$dt)
appledf$click = as.character(appledf$click)
appledf$article <- sapply(appledf$article, contentclean)
appledf$dt <- strptime(appledf$dt, "%Y年%m月%d日%H:%M")
appledf$click =
as.integer(gsub('.{2}\\((\\d+)\\)', '\\1',x = appledf$click))
合併所有的 data frame
a.frame <- data.frame(x= c(1,2,3), b =c(2,3,4))
b.frame <- data.frame(x= c(3,3,2), b =c(1,2,3))
do.call("rbind", list(a.frame, b.frame))
## x b
## 1 1 2
## 2 2 3
## 3 3 4
## 4 3 1
## 5 3 2
## 6 2 3
do.call("cbind", list(a.frame, b.frame))
## x b x b
## 1 1 2 3 1
## 2 2 3 3 2
## 3 3 4 2 3