網路爬蟲入門 in R

Jying-Nan Wang

2016/11/12

什麼是網路爬蟲 (Web spider)

網路爬蟲 Q&A

設計網路爬虫注意事項

網路爬虫的步驟

(注意:並非所有網站都能順利取得資料)

網頁背景知識

Case 1: 分析師資料 (使用XML套件)

library(XML)
stockURL <- "http://finance.yahoo.com/q/ud?bypass=true&s=AAPL"
areport <- readHTMLTable(stockURL, header=F, which=3, stringsAsFactors=F)
# clean data based on the specifical web table structure
areport <- areport[12:(nrow(areport)-8),]
areport <- data.frame(areport[-1,])
head(areport)
##           V1             V2        V3            V4             V5
## 13 27-Oct-16 Hilliard Lyons Downgrade           Buy  Long-term Buy
## 14 26-Oct-16         Stifel Downgrade           Buy           Hold
## 15  8-Sep-16    Wells Fargo Downgrade    Outperform Market Perform
## 16 28-Jul-16 Hilliard Lyons   Upgrade Long-term Buy            Buy
## 17 27-Jul-16  Raymond James   Upgrade   Mkt Perform     Outperform
## 18  6-Jul-16        Longbow Initiated                          Buy

Case 1: 分析師資料 (使用rvest套件)

library(rvest)
stockURL <- "http://finance.yahoo.com/q/ud?bypass=true&s=AAPL"
sdat <- html(stockURL) %>% 
  html_nodes(".yfnc_tabledata1") %>%
  html_text()

# 整理資料
sdat <- matrix(sdat,ncol=5,byrow = TRUE)
sdat <- data.frame(sdate=sdat[,1],sfirm=sdat[,2],sact=sdat[,3],from1=sdat[,4],to1=sdat[,5])
head(sdat)
##       sdate          sfirm      sact         from1            to1
## 1 27-Oct-16 Hilliard Lyons Downgrade           Buy  Long-term Buy
## 2 26-Oct-16         Stifel Downgrade           Buy           Hold
## 3  8-Sep-16    Wells Fargo Downgrade    Outperform Market Perform
## 4 28-Jul-16 Hilliard Lyons   Upgrade Long-term Buy            Buy
## 5 27-Jul-16  Raymond James   Upgrade   Mkt Perform     Outperform
## 6  6-Jul-16        Longbow Initiated                          Buy

Case 2: 中時電子報

library(rvest)
news_url="http://www.chinatimes.com/money/realtimenews?page=1"
   mytitle <- html(news_url) %>% 
     html_nodes(".vertical-list h2") %>% 
     html_text()
   head(mytitle)
## [1] "喜康生技與亞獅康合作生產ASLAN004"         
## [2] "訊聯生技再生醫學、醫美耕耘再下一城"       
## [3] "台股成交量能不錯 財長:國安基金暫不進場"  
## [4] "《匯市》新台幣午盤貶值1.08角 暫報31.909"  
## [5] "台灣精品選拔現場首次公開 參選產品再創新高"
## [6] "《盤中》法人卡位金融 台股中場平盤附近拉鋸"
library(rvest)
news_url="http://www.chinatimes.com/money/realtimenews?page=1"

my_news =
  data.frame(
    title = html(news_url) %>% 
      html_nodes(".vertical-list h2") %>% 
      html_text(), 
    title_href = html(news_url) %>% 
      html_nodes(".vertical-list h2") %>%  
      html_nodes("a") %>% 
      html_attr( "href") ,
    stringsAsFactors=FALSE)  
head(my_news)
##                                       title
## 1          喜康生技與亞獅康合作生產ASLAN004
## 2        訊聯生技再生醫學、醫美耕耘再下一城
## 3   台股成交量能不錯 財長:國安基金暫不進場
## 4   《匯市》新台幣午盤貶值1.08角 暫報31.909
## 5 台灣精品選拔現場首次公開 參選產品再創新高
## 6 《盤中》法人卡位金融 台股中場平盤附近拉鋸
##                                                     title_href
## 1 http://www.chinatimes.com/realtimenews/20161114003137-260410
## 2 http://www.chinatimes.com/realtimenews/20161114003118-260410
## 3 http://www.chinatimes.com/realtimenews/20161114003127-260410
## 4 http://www.chinatimes.com/realtimenews/20161114002879-260410
## 5 http://www.chinatimes.com/realtimenews/20161114002753-260410
## 6 http://www.chinatimes.com/realtimenews/20161114002750-260410
  # 已經取得my_news資料
  i <- 1
  doc_url <- my_news[i,2]
  doc_temp <- html(doc_url) %>% html_nodes(".clear-fix")%>% html_nodes("p") %>% html_text()
  # 清理資料
  lastnum <- length(doc_temp)-7
  x<-paste(doc_temp[1:lastnum], collapse = '', sep="")
  x
## [1] "喜康生技宣佈與專注於亞洲盛行之癌症,開發免疫療法與標靶抗癌藥物之生物科技公司 – 亞獅康,締結製造服務合作夥伴關係,喜康將為亞獅康ASLAN004之後續開發,提供製程發展服務。ASLAN004為阻斷IL-4 與 IL-13訊號傳導之全人源單株抗體,亞獅康與喜康雙方團隊將密切合作,為ASLAN004臨床試驗用藥之生產進行分析與製程開發,並由已取得GMP認證之喜康竹北廠負責,自2016年展開ASLAN004的開發及製造。亞獅康營運長Mark McHale指出,喜康製程開發與生產製造能力將加速公司特有生物製劑產品的開發。亞獅康專注於開發亞洲盛行癌症的全新療法,與喜康生技的合作將能擴展公司與亞洲區域重要夥伴策略合作關係。"
  # 已經取得my_news資料
  i <- 1
  doc_url <- my_news[i,2]
  doc_time <- html(doc_url) %>% html_nodes(".reporter") %>% html_nodes("time") %>% html_text()
  date_value <- paste(substr(doc_time,1,4),substr(doc_time,6,7),substr(doc_time,9,10),sep="")
  date_value <- paste(date_value, substr(doc_time,13,14), substr(doc_time,16,17),sep="")
  filename <- paste("chinatimes/", date_value, "_", my_news[i,1], sep="")
  filename
## [1] "chinatimes/201611141352_喜康生技與亞獅康合作生產ASLAN004"
  doc_temp <- html(doc_url) %>% html_nodes(".clear-fix")%>% html_nodes("p") %>% html_text()

  lastnum <- length(doc_temp)-7
  x<-paste(doc_temp[1:lastnum], collapse = '', sep="")
  x
## [1] "喜康生技宣佈與專注於亞洲盛行之癌症,開發免疫療法與標靶抗癌藥物之生物科技公司 – 亞獅康,締結製造服務合作夥伴關係,喜康將為亞獅康ASLAN004之後續開發,提供製程發展服務。ASLAN004為阻斷IL-4 與 IL-13訊號傳導之全人源單株抗體,亞獅康與喜康雙方團隊將密切合作,為ASLAN004臨床試驗用藥之生產進行分析與製程開發,並由已取得GMP認證之喜康竹北廠負責,自2016年展開ASLAN004的開發及製造。亞獅康營運長Mark McHale指出,喜康製程開發與生產製造能力將加速公司特有生物製劑產品的開發。亞獅康專注於開發亞洲盛行癌症的全新療法,與喜康生技的合作將能擴展公司與亞洲區域重要夥伴策略合作關係。"
  # write.table(x[1], paste(filename, ".txt",sep="") , quote = FALSE, row.names=FALSE, col.names = FALSE)
  # Sys.sleep(runif(1,0,5))

Case 3: Box Office Mojo

library(rvest)
library(XML)
library(magrittr)

alldat <- matrix( ,ncol=11,nrow=0)

for (tyear in 2010:2010){
  for (pid in 1:1){
    stockURL <- paste("http://www.boxofficemojo.com/yearly/chart/?page=",pid,"&view=releasedate&view2=domestic&yr=",tyear,"&p=.htm",sep="")
    
    sdat <- html(stockURL)  %>% 
      html_nodes("table") %>%
      extract2(4) %>%
      html_nodes("a") %>%
      html_attr("href") 
    
    temp1 <- sdat[seq(18,315,by=3)] # 不同年份会不一样
    
    sdat1 <- html(stockURL)  %>% 
      html_nodes("table") %>%
      extract2(4) %>%
      html_nodes("td") %>%
      html_text()
    temp2 <- matrix(sdat1[15:914],nrow=100,byrow = TRUE)
    alltemp <- cbind(rep(tyear,100),temp2,temp1)
    
    alldat <- rbind(alldat,alltemp)
    
  }
}

alldat1 <- data.frame(alldat)
colnames(alldat1) <- c("myear","mrank","mtitle","mstudio","mgross","mtheaters","mopen","motheaters","modate","mcdate","mhref")
head(alldat1)
##   myear mrank                                      mtitle mstudio
## 1  2010     1                                 Toy Story 3      BV
## 2  2010     2                  Alice in Wonderland (2010)      BV
## 3  2010     3                                  Iron Man 2    Par.
## 4  2010     4                  The Twilight Saga: Eclipse    Sum.
## 5  2010     5 Harry Potter and the Deathly Hallows Part 1      WB
## 6  2010     6                                   Inception      WB
##         mgross mtheaters        mopen motheaters modate mcdate
## 1 $415,004,880     4,028 $110,307,189      4,028   6/18   12/2
## 2 $334,191,110     3,739 $116,101,023      3,728    3/5    7/8
## 3 $312,433,331     4,390 $128,122,480      4,380    5/7   8/19
## 4 $300,531,751     4,468  $64,832,191      4,468   6/30  10/21
## 5 $295,983,305     4,125 $125,017,372      4,125  11/19    4/7
## 6 $292,576,195     3,792  $62,785,337      3,792   7/16    1/6
##                                 mhref
## 1           /movies/?id=toystory3.htm
## 2 /movies/?id=aliceinwonderland10.htm
## 3            /movies/?id=ironman2.htm
## 4             /movies/?id=eclipse.htm
## 5        /movies/?id=harrypotter7.htm
## 6           /movies/?id=inception.htm
#write.csv(alldat1,"movtable2010.csv",row.names = FALSE)

網路爬虫練習

網路爬虫的下一步

進階爬蟲測試: ptt Gossiping版

library(rvest)

purl <- "https://www.ptt.cc/bbs/Gossiping/index.html"
pg <- read_html(purl) %>%
  html_node("body") %>%
  html_nodes(".title") %>%
  html_text(trim=TRUE) 
print(pg)
## character(0)

進階爬蟲案例: 需登入帳號密碼

library(rvest)
url <- "http://58921.com/user/login?redirect=%2Fboxoffice%2Fhistory%2F2882"
session <- html_session(url)
form1 <- html_form(session)[[1]]
filled_form <- set_values(form1,
                          `mail` = "ylchiu@126.com", 
                          `pass` = "xxxxxx")

movieURL <- "http://58921.com/alltime/2015?page=0"
mdat  <- submit_form(session, filled_form) %>%
  jump_to(movieURL) %>%
  html_nodes("table td") %>%
  html_text(trim=TRUE)
temp <- matrix(mdat, ncol=8, byrow=TRUE)

for (i in 1:3){
movieURL <- paste("http://58921.com/alltime/2015?page=", i, sep="")
mdat  <- submit_form(session, filled_form) %>%
  jump_to(movieURL) %>%
  html_nodes("table td") %>%
  html_text(trim=TRUE)

temp <- rbind(temp, matrix(mdat, ncol=8, byrow=TRUE))
}

進階爬蟲案例: Javascript

var url ='http://www.gewara.com/movie/65381346';
var page = new WebPage()
var fs = require('fs');

page.open(url, function (status) {
        just_wait();
});

function just_wait() {
    setTimeout(function() {
               fs.write('1.html', page.content, 'w');
            phantom.exit();
    }, 2500);
}
library(rvest)
library(XML)
library(magrittr)

allcid <- matrix(,nrow=0,ncol=1)

for (i in 1:1){
  url <- paste0("http://movie.mtime.com/movie/search/section/#pageIndex=",i,"&year=2009")
  
  lines <- readLines("hello.js")
  lines[1] <- paste0("var url ='", url ,"';")
  writeLines(lines, "hello.js")
  ## Download website
  system("phantomjs hello.js")
  
  pg <- read_html("1.html")
  cid <- pg %>% html_nodes("a") %>% html_attr("href")
  cid <- grep("http\\:\\/\\/movie\\.mtime\\.com\\/[0-9]",cid, value = TRUE)
  cid <- cid[!duplicated(cid)]
  cid <- cid[1:20]
  allcid <- c(allcid, cid)
  Sys.sleep(runif(1,1,5))
}

allcid <- allcid[!duplicated(allcid)]