PTT 爬蟲

library(rvest)
## Warning: package 'rvest' was built under R version 3.2.5
## Loading required package: xml2
ptturl = 'https://www.ptt.cc/bbs/Food/index.html'
rent = read_html(ptturl) %>% html_nodes('.r-ent')
titles = rent %>% html_nodes('.title') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
authors = rent %>% html_nodes('.author') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
dts = rent %>% html_nodes('.date') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
pttdf = data.frame(title = titles, author = authors, dt = dts)
View(pttdf)

酷航爬蟲

library(httr)
## Warning: package 'httr' was built under R version 3.2.5
payload = list(
  `__EVENTTARGET`='AvailabilitySearchInputSearchView$LinkButtonSubmit',
  `availabilitySearch.SearchInfo.SearchStations[0].DepartureStationCode`='TPE',
  `availabilitySearch.SearchInfo.SearchStations[0].ArrivalStationCode`='NRT',
  `availabilitySearch.SearchInfo.SearchStations[0].DepartureDate`='12/16/2016',
  `availabilitySearch.SearchInfo.SearchStations[1].DepartureStationCode`='NRT',
  `availabilitySearch.SearchInfo.SearchStations[1].ArrivalStationCode`='TPE',
  `availabilitySearch.SearchInfo.SearchStations[1].DepartureDate`='12/30/2016',
  `availabilitySearch.SearchInfo.Direction`='Return',
  `fromDate`='1467993600000',
  `returnDate`='1468512000000',
  `availabilitySearch.SearchInfo.AdultCount`='5',
  `availabilitySearch.SearchInfo.ChildrenCount`='0',
  `availabilitySearch.SearchInfo.InfantCount`='0'  
)

flySelect = 'http://makeabooking.flyscoot.com/?culture=zh-TW'
res = POST(flySelect, body = payload, encode = 'form' , encoding='UTF-8')
#res

flyInfo = 'http://makeabooking.flyscoot.com/Flight/Select'
res2 = GET(flyInfo)
flightinfo = content(res2) %>% html_nodes('.flight_select')
flightprice = flightinfo %>% html_nodes('.flight_price') %>% html_text()
flightdate = flightinfo %>% html_nodes('.flightdate') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
flightdf = data.frame(price = flightprice, dt  =  flightdate )
View(flightdf)

蘋果新聞列表抓取

library(rvest)
applenews = 'http://www.appledaily.com.tw/realtimenews/section/new/'
rtddt = read_html(applenews) %>% html_nodes('.rtddt a')
titles = rtddt %>% html_nodes('h1') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
categorys = rtddt %>% html_nodes('h2') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
times = rtddt %>% html_nodes('time') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
links = rtddt %>% html_attr('href') %>% iconv(from='UTF-8', to='UTF-8')
links = paste0('http://www.appledaily.com.tw', links)
applelist = data.frame(title = titles, category = categorys, time = times, link = links)

內容頁抓取

appleurl = 'http://www.appledaily.com.tw/realtimenews/article/entertainment/20160710/904877/伊能靜生女照被當廣告 坐月子還要忙告人'
read_html(appleurl) %>% html_nodes('.trans') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
## [1] "\n                                        伊能靜上月在美國生下女兒「小米粒」,還在坐月子卻不得閒,和老公秦昊跨海委託律師發聲明,捍衛肖像權。事件起因於她的主治醫生未經她同意,在網路公開他們的合照,結果被其他商家拿來當作赴美生子服務、月子服務、產婦食品等廣告。伊能靜工作室po出律師聲明,並表示:「醫師若要曝光患者,應經患者同意授權。保護患者隱私及保護未成年孩子曝光,乃醫師基本醫德。如今,特意告知私下紀念切勿公開外流的紀念照成了廣告,已超出應有醫德之範圍。經溝通,謝謝醫師願意道歉。請各界勿擅用未經授權的照片做商業使用,以免觸法。」而伊能靜解決心頭大患後,昨晚照例在微博po出母女照,表示當地時間是早上8點,她剛睡醒,準備餵女兒喝奶,但網友都關心小米粒「歪過頭」,怕寶寶脖子不舒服但寶寶不能說。(吳惠菁/綜合報導)\n                    【看了這則新聞的人,也看了……】好大牌!半夜奔波225公里買披薩 天后卻不吃了仔仔做人超車F3 年輕阿公不是夢人妻女星修修臉上癮 明明變個人還裝傻伊能靜昨委託律師發聲明。翻攝伊能靜微博伊能靜生女後和主治醫生合照,被對方po上網路,因此發律師函捍衛肖像權。翻攝伊能靜工作室微博                    \n\t\t\t\t\tgoogletag.cmd.push(function() {googletag.display(\"InRead\");})\n                    googletag.cmd.push(function() {googletag.display('teadstv');})\n                    googletag.cmd.push(function() {googletag.display('goldenhorse');})\n                    googletag.cmd.push(function() {googletag.display('textlink');})\n                    有話要說 投稿「即時論壇」\n                  "

連結抓取範例

# attr 可以抓取連結屬性
read_html('<a href="largitdata.com" qoo=123> hihi</a>') %>% html_nodes('a') %>% html_attr('href')
## [1] "largitdata.com"
read_html('<a href="largitdata.com" qoo=123> hihi</a>') %>% html_nodes('a') %>% html_attr('qoo')
## [1] "123"

paste0

paste('hello', 'world')
## [1] "hello world"
paste0('hello', 'world')
## [1] "helloworld"
paste('hello', 'world', sep='@')
## [1] "hello@world"
# paste with sep='' equals  to paste0 
paste('hello', 'world', sep='')
## [1] "helloworld"

剖析內容頁

appleurl = 'http://www.appledaily.com.tw/realtimenews/article/entertainment/20160710/904877/伊能靜生女照被當廣告 坐月子還要忙告人'
h1 = read_html(appleurl) %>% html_nodes('#h1') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
trans = read_html(appleurl) %>% html_nodes('.trans') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
clicked = read_html(appleurl) %>% html_nodes('.clicked') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
gggstime = read_html(appleurl) %>% html_nodes('.gggs time') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')

df = data.frame(title = h1, article = trans, click = clicked, dt = gggstime)

函式 (Function)

ftoc  = function(F_degree){
  ((F_degree - 32) * 5)  / 9 #Return
}

ftoc(62)
## [1] 16.66667
newscore  = function(score){
  s1 = sqrt(score)
  s2 = s1 * 10
  s2 #Return
}
newscore(60)
## [1] 77.45967
addnum = function(a, b = 10){
  a + b
}

addnum(3,5)
## [1] 8
addnum(3)
## [1] 13
addnum2 = function(a = 5, b = 10){
  a + b
}
addnum2(3)
## [1] 13
addnum2(b = 3)
## [1] 8

建立內文元素剖析函式

getArticle = function(appleurl, category){
  h1 = read_html(appleurl) %>% html_nodes('#h1') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
  trans = read_html(appleurl) %>% html_nodes('.trans') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
  clicked = read_html(appleurl) %>% html_nodes('.clicked') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
  gggstime = read_html(appleurl) %>% html_nodes('.gggs time') %>% html_text() %>% iconv(from='UTF-8', to='UTF-8')
  
  df = data.frame(title = h1, article = trans, click = clicked, dt = gggstime, category = category)
  df
}

news = getArticle('http://www.appledaily.com.tw/realtimenews/article/life/20160710/904894/%E6%95%91%E6%95%91%E5%8F%B0%E6%9D%B1%E4%BA%BA%EF%BC%81%E5%8F%B0%E6%9D%B1%E9%A6%AC%E5%81%95%E5%89%B5%E9%99%A229%E5%B9%B4%E4%BE%86%E6%9C%80%E5%9A%B4%E9%87%8D%E7%81%BD%E6%90%8D', 'qpp')
View(news)

FOR 迴圈

for(i in 1:10){
  print(paste("hello", i))
}
## [1] "hello 1"
## [1] "hello 2"
## [1] "hello 3"
## [1] "hello 4"
## [1] "hello 5"
## [1] "hello 6"
## [1] "hello 7"
## [1] "hello 8"
## [1] "hello 9"
## [1] "hello 10"

取得各篇文章細節

dfall = data.frame(title = character(), 
                   article= character(),
                   click = character(), 
                   dt =character(),
                   category= character())
getURL <- function(applenews){  
  #applenews = 'http://www.appledaily.com.tw/realtimenews/section/new/'
  rtddt = read_html(applenews) %>% html_nodes('.rtddt a')
  for (ele in rtddt){
    categorys = ele %>% html_nodes('h2') %>% 
               html_text() %>% iconv(from='UTF-8', to='UTF-8')
    links = ele %>% html_attr('href') %>% 
               iconv(from='UTF-8', to='UTF-8')
    links = paste0('http://www.appledaily.com.tw', links)
    df = getArticle(links, categorys)
    dfall <- rbind(dfall,df)

  }
  dfall
}

applenews = getURL('http://www.appledaily.com.tw/realtimenews/section/new/')
View(applenews)

list 資料結構

# 建立一個list (with name)
person = list(name = 'Qoo', age = 30, likes=c('coke', 'cola', 'qoo'))
person$name # get value by name
## [1] "Qoo"
# 建立一個list (without name)
a = list(c(1,2,3), c(1,2))
a[[1]] # get value by position
## [1] 1 2 3
a[[2]]
## [1] 1 2

清理文章資料

a = 'aaa@bbb@ccc'
b = strsplit(a, '@')
b[[1]][1]
## [1] "aaa"
s = c('aaa@bbb@ccc', 'ddd@eee')
s1 = strsplit(s, '@')
s1
## [[1]]
## [1] "aaa" "bbb" "ccc"
## 
## [[2]]
## [1] "ddd" "eee"
class(s1)
## [1] "list"
s1[[1]]
## [1] "aaa" "bbb" "ccc"
s1[[2]]
## [1] "ddd" "eee"

character v.s. factor

class(c(1,2,3))
## [1] "numeric"
class(c('1','2','3'))
## [1] "character"
as.factor(c(1,2,3,1,2,3,1,2))
## [1] 1 2 3 1 2 3 1 2
## Levels: 1 2 3
# plot as numeric
plot(c(1,2,3,1,2,3,1,2))

# plot as factor
plot(as.factor(c(1,2,3,1,2,3,1,2)))

trimws

a <- '    Hi This is a sample  '
a
## [1] "    Hi This is a sample  "
trimws(a)
## [1] "Hi This is a sample"
trimws(a, 'l')
## [1] "Hi This is a sample  "
trimws(a, 'r')
## [1] "    Hi This is a sample"

collapse string

a = 'aaa@bbb@ccc'
b = strsplit(a, '@')
b[[1]][1]
## [1] "aaa"
paste(b[[1]], collapse = ' ')
## [1] "aaa bbb ccc"
paste(b[[1]], collapse = '|')
## [1] "aaa|bbb|ccc"

清理文章資料

class(applenews[ 1 , "article" ])
## [1] "factor"
# 將factor 轉型回character
applenews$article = as.character(applenews$article)

# clean up the article
a = strsplit(applenews[ 1 , "article" ], 'googletag')
b = a[[1]][1]
c = trimws(b)
d = strsplit(c, '  +')
e = paste(d, collapse=' ')

# write in magrittr %>%
strsplit(applenews[ 1 , "article" ], 'googletag') %>% .[[1]] %>% .[1] %>% trimws() %>% strsplit(., '  +') %>% .[[1]] %>% paste(., collapse = ' ')
## [1] "更新時間:16:33【更新:加賽結果】今天被迫與中國隊加賽的台灣代表隊,在加賽中火力完全釋放,終場以11比1大勝中國隊,台灣明天上午11點,繼續與菲律賓爭奪冠軍,以及LLB亞太區青棒代表權。【更新:新增影片】在中國參加2016年LLB(Little League Baseball,世界少棒聯盟)亞太區錦標賽的台灣代表隊原訂今天下午要與菲律賓隊打冠軍戰,但昨天台菲戰遭中國隊申訴,違反運動家精神 ,除教練團要被禁賽外,今天還需與中國隊再打一場,勝者才能與菲律賓打冠軍賽,這起事件引起許多網友討論(LLB台灣隊違反運動家精神 網友反應不一)。國際運動場上「策略運用」選對手,或「假球」的事件屢見不鮮,《蘋果》整理以下案例:【倫敦奧運】2012年倫敦奧運羽球小組循環賽,當時中國女雙組合、世界排名第1的王曉理、于洋刻意輸給南韓的「金銀組合」鄭景銀、金河娜,就是為了4強賽時能避開自家人趙雲蕾、田卿。南韓不甘願贏了比賽,另一組女雙河貞恩與金旼貞在面對印尼的比賽中,也刻意想要輸,因為一旦贏了比賽,8強淘汰賽首役,河貞恩與金旼貞就要硬碰世界第1的王曉理、于洋。最後世界羽球總會(BWF)出面滅火,將8名選手全處以失格極刑。倫敦奧運羽球打假球 共8人遭判失格【玉山盃青棒賽】2008年玉山盃青棒賽,是台灣首次因「違反運動精神」宣布比賽重賽的棒球賽事。該年玉山盃,平鎮高中為主體的桃園縣與中道中學為主體的宜蘭縣,兩隊為求晉級而消極比賽,比賽中途遭大會裁定取消,隔日再賽。違反運動家精神 台灣棒壇有前例【LLS女壘比賽】去年LLS(Little League Softball,世界少壘聯盟)的女壘比賽,華盛頓州代表隊跟本次台灣隊情況相同,雖早已確定晉級準決賽,但在最後一場預賽中,若勝過對手北卡羅萊納隊3分以上,將會在準決賽對上實力較強的愛荷華代表隊,因此教練讓主力球員坐板凳,最後華盛頓以0比8敗給北卡,也讓實力較強的愛荷華遭淘汰。愛荷華代表隊立即提出抗議,大會隨即判定除北卡取得準決賽資格外,愛荷華與華盛頓必需要加賽一場,決定晉級隊伍,最後愛荷華在加賽中擊敗華盛頓,獲得晉級。違反運動精神加賽 去年少棒有先例【HBL高中籃球聯賽】HBL高中籃球聯賽也曾發生過策略性選擇對手的案件,2008年12月30日,男子組16強複賽第2輪青年高中與南湖高中的比賽,當時最後14.3秒兩隊46比46打平,青年掌握最後一波進攻權,教練卻下達戰術要球員故意失誤,最後青年以2分之差輸球,也把新榮高中淘汰出局。教練遭處禁賽一年。HBL也曾發生策略性選擇 教練遭禁賽一年10:40今天上午判定台灣隊違反運動精神,除了教練團遭禁賽外,下午1點半還需要先與中國隊打一場加賽,勝者才能爭冠。台灣隊目前由隨隊具有教練資格的吳御瑋負責比賽調度。中華棒協秘書長林宗成表示,目前已經收到世界少棒聯盟所寄來的通知,除了上述教練團停權、需加賽一場兩點外,現階段會如期把比賽打完,待球隊回國後,將立即召開技術委員會討論此重大事件。<U+00A0>7月9日台灣隊昨天賽前已經篤定可以打入冠軍戰,但中國隊因為吞3連敗,僅在昨天勝過菲律賓,因此想要爭冠必需要看台菲戰結果,而台灣隊在刻意保留實力下,派出辛元旭擔任投手對上菲律賓。儘管台灣隊全場敲出8支安打,卻不敵在3局下敲出2分砲的菲律賓隊,終場以2比4輸球,這也確定中國隊遭淘汰,今天將由台菲爭冠,不過中國隊立即提出申訴。(賴德剛/綜合報導)LLB亞太區青棒 台灣明與菲律賓爭冠軍發稿時間:10:40更新時間:14:44<U+00A0>\n 【想知道更多,一定要看……】<U+200B>HBL也曾發生策略性選擇 教練遭禁賽一年LLB台灣隊違反運動家精神 網友反應不一<U+200B>倫敦奧運羽球打假球 共8人遭判失格 【看了這則新聞的人,也看了……】違反運動精神加賽 去年有先例【更新】西武吳念庭一軍首安出爐 李振昌重返一軍【好大膽】NFL球員罵過界 竟砲柯瑞斂財 亞太區青棒台灣被判違反運動家精神。資料照片亞太區青棒台灣被判違反運動家精神。資料照片台菲戰先發投手辛元旭。資料照片"

gsub 範例

a <- 'ABC@123@DEF'
gsub('.*@(.*)@.*', '\\1', a)
## [1] "123"
gsub('.*@(.*)@(.*)', '\\2', a)
## [1] "DEF"
email <- 'qoo@oop.com'
gsub('(.+)@(.+)', '\\1', email)
## [1] "qoo"
gsub('(.+)@(.+)', '\\2', email)
## [1] "oop.com"

轉換人氣數

class(applenews$click)
## [1] "factor"
applenews$click = as.character(applenews$click)

gsub('.{2}\\((\\d+)\\)', '\\1',x = applenews[1,'click'])
## [1] "178059"
gsub('.{2}\\((\\d+)\\)', '\\1',x = applenews[,'click'])
##  [1] "178059" "0"      "2526"   "0"      "0"      "9706"   "63"    
##  [8] "45"     "289"    "1249"   "561"    "1037"   "6133"   "27502" 
## [15] "1167"   "983"    "5679"   "331"    "105"    "14590"  "1220"  
## [22] "1220"   "895"    "1743"   "101"    "21515"  "340"    "552"   
## [29] "2230"   "6360"   "1097"
applenews$click = 
as.integer(gsub('.{2}\\((\\d+)\\)', '\\1',x = applenews$click))

sort(applenews$click)
##  [1]      0      0      0     45     63    101    105    289    331    340
## [11]    552    561    895    983   1037   1097   1167   1220   1220   1249
## [21]   1743   2230   2526   5679   6133   6360   9706  14590  21515  27502
## [31] 178059
head(applenews[order(applenews$click, decreasing = TRUE), c('title', 'click') ], 5)
##                                                       title  click
## 1  【不斷更新】亞太區青棒台灣遭申訴仍大勝中國 明與菲爭冠  178059
## 14                【我難過】小S台東媳婦伴公公 捐百萬助鄉親  27502
## 26           【有片】開齋節慶祝活動 台北車站大廳湧入穆斯林  21515
## 20                        【更新】沒事插花罵警靠北 2男GG了  14590
## 6                  蘇嘉全弔唁飛彈誤射喪家 籲最高額國賠處理   9706

字串轉換為時間

ds <- c("18, 2014 12:00")
x1 <- strptime(ds, "%d, %Y %H:%M")
x1
## [1] "2014-07-18 12:00:00 CST"
ds = '2016年04月05日22:47'
x2 <- strptime(ds, "%Y年%m月%d日%H:%M")
x2
## [1] "2016-04-05 22:47:00 CST"
x2 - x1
## Time difference of 627.4493 days

時間轉換

applenews$dt <- strptime(applenews$dt, "%Y年%m月%d日%H:%M")
View(applenews)

*apply family

# for loop
s = 0
for(i in 1:10){
  s = s + i
}
s
## [1] 55
# *apply
x = list(  c(1,2,3)  , c(2,3,4,5) )
lapply(x, sum)
## [[1]]
## [1] 6
## 
## [[2]]
## [1] 14
sapply(x, sum)
## [1]  6 14
f = function(e){
  e[1]
}
lapply(x, f)
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
sapply(x, f)
## [1] 1 2
lapply(x, function(e)e[1])
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
sapply(x, function(e)e[1])
## [1] 1 2

清理文章

contentclean <- function(news){
    news %>% strsplit("googletag") %>% .[[1]] %>% .[1] %>% trimws() %>% strsplit('[ \n\r]+') %>% unlist() %>% paste(collapse=' ')
}

applenews$article <- sapply(applenews$article, contentclean)

抓取分頁資訊

pageurl = 'http://www.appledaily.com.tw/realtimenews/section/new/'

# use for loop
for (i in 1:30){
  print(paste0(pageurl, i) )
}

# use sapply
sapply(1:30, function(e) paste0(pageurl,e))



pageurl <- 'http://www.appledaily.com.tw/realtimenews/section/new/'
dfall <- lapply(1:3, 
  function(i){
    url <- paste0(pageurl, i)
    df <- getURL(url)
  } 
)
appledf <- do.call("rbind",dfall)

appledf$article = as.character(appledf$article)
appledf$dt = as.character(appledf$dt)
appledf$click = as.character(appledf$click)

appledf$article <- sapply(appledf$article, contentclean)
appledf$dt <- strptime(appledf$dt, "%Y年%m月%d日%H:%M")
appledf$click = 
as.integer(gsub('.{2}\\((\\d+)\\)', '\\1',x = appledf$click))

合併所有的 data frame

a.frame <- data.frame(x= c(1,2,3), b =c(2,3,4))
b.frame <- data.frame(x= c(3,3,2), b =c(1,2,3))
do.call("rbind", list(a.frame, b.frame))
##   x b
## 1 1 2
## 2 2 3
## 3 3 4
## 4 3 1
## 5 3 2
## 6 2 3
do.call("cbind", list(a.frame, b.frame))
##   x b x b
## 1 1 2 3 1
## 2 2 3 3 2
## 3 3 4 2 3