Jying-Nan Wang
2016/11/12
The web spider is a robot that works on the Internet. It is known by a variety of different names including a web crawler, an ant, an automatic indexer, and a web scutter, but its purpose remains the same.
The web spider allows users to find webpages quickly. The web spider also validates links and HTML code, and sometimes it extracts other information from the website.
(注意:並非所有網站都能順利取得資料)
library(XML)
stockURL <- "http://finance.yahoo.com/q/ud?bypass=true&s=AAPL"
areport <- readHTMLTable(stockURL, header=F, which=3, stringsAsFactors=F)
# clean data based on the specifical web table structure
areport <- areport[12:(nrow(areport)-8),]
areport <- data.frame(areport[-1,])
head(areport)## V1 V2 V3 V4 V5
## 13 27-Oct-16 Hilliard Lyons Downgrade Buy Long-term Buy
## 14 26-Oct-16 Stifel Downgrade Buy Hold
## 15 8-Sep-16 Wells Fargo Downgrade Outperform Market Perform
## 16 28-Jul-16 Hilliard Lyons Upgrade Long-term Buy Buy
## 17 27-Jul-16 Raymond James Upgrade Mkt Perform Outperform
## 18 6-Jul-16 Longbow Initiated Buy
library(rvest)
stockURL <- "http://finance.yahoo.com/q/ud?bypass=true&s=AAPL"
sdat <- html(stockURL) %>%
html_nodes(".yfnc_tabledata1") %>%
html_text()
# 整理資料
sdat <- matrix(sdat,ncol=5,byrow = TRUE)
sdat <- data.frame(sdate=sdat[,1],sfirm=sdat[,2],sact=sdat[,3],from1=sdat[,4],to1=sdat[,5])
head(sdat)## sdate sfirm sact from1 to1
## 1 27-Oct-16 Hilliard Lyons Downgrade Buy Long-term Buy
## 2 26-Oct-16 Stifel Downgrade Buy Hold
## 3 8-Sep-16 Wells Fargo Downgrade Outperform Market Perform
## 4 28-Jul-16 Hilliard Lyons Upgrade Long-term Buy Buy
## 5 27-Jul-16 Raymond James Upgrade Mkt Perform Outperform
## 6 6-Jul-16 Longbow Initiated Buy
library(rvest)
news_url="http://www.chinatimes.com/money/realtimenews?page=1"
mytitle <- html(news_url) %>%
html_nodes(".vertical-list h2") %>%
html_text()
head(mytitle)## [1] "喜康生技與亞獅康合作生產ASLAN004"
## [2] "訊聯生技再生醫學、醫美耕耘再下一城"
## [3] "台股成交量能不錯 財長:國安基金暫不進場"
## [4] "《匯市》新台幣午盤貶值1.08角 暫報31.909"
## [5] "台灣精品選拔現場首次公開 參選產品再創新高"
## [6] "《盤中》法人卡位金融 台股中場平盤附近拉鋸"
library(rvest)
news_url="http://www.chinatimes.com/money/realtimenews?page=1"
my_news =
data.frame(
title = html(news_url) %>%
html_nodes(".vertical-list h2") %>%
html_text(),
title_href = html(news_url) %>%
html_nodes(".vertical-list h2") %>%
html_nodes("a") %>%
html_attr( "href") ,
stringsAsFactors=FALSE)
head(my_news)## title
## 1 喜康生技與亞獅康合作生產ASLAN004
## 2 訊聯生技再生醫學、醫美耕耘再下一城
## 3 台股成交量能不錯 財長:國安基金暫不進場
## 4 《匯市》新台幣午盤貶值1.08角 暫報31.909
## 5 台灣精品選拔現場首次公開 參選產品再創新高
## 6 《盤中》法人卡位金融 台股中場平盤附近拉鋸
## title_href
## 1 http://www.chinatimes.com/realtimenews/20161114003137-260410
## 2 http://www.chinatimes.com/realtimenews/20161114003118-260410
## 3 http://www.chinatimes.com/realtimenews/20161114003127-260410
## 4 http://www.chinatimes.com/realtimenews/20161114002879-260410
## 5 http://www.chinatimes.com/realtimenews/20161114002753-260410
## 6 http://www.chinatimes.com/realtimenews/20161114002750-260410
# 已經取得my_news資料
i <- 1
doc_url <- my_news[i,2]
doc_temp <- html(doc_url) %>% html_nodes(".clear-fix")%>% html_nodes("p") %>% html_text()
# 清理資料
lastnum <- length(doc_temp)-7
x<-paste(doc_temp[1:lastnum], collapse = '', sep="")
x## [1] "喜康生技宣佈與專注於亞洲盛行之癌症,開發免疫療法與標靶抗癌藥物之生物科技公司 – 亞獅康,締結製造服務合作夥伴關係,喜康將為亞獅康ASLAN004之後續開發,提供製程發展服務。ASLAN004為阻斷IL-4 與 IL-13訊號傳導之全人源單株抗體,亞獅康與喜康雙方團隊將密切合作,為ASLAN004臨床試驗用藥之生產進行分析與製程開發,並由已取得GMP認證之喜康竹北廠負責,自2016年展開ASLAN004的開發及製造。亞獅康營運長Mark McHale指出,喜康製程開發與生產製造能力將加速公司特有生物製劑產品的開發。亞獅康專注於開發亞洲盛行癌症的全新療法,與喜康生技的合作將能擴展公司與亞洲區域重要夥伴策略合作關係。"
# 已經取得my_news資料
i <- 1
doc_url <- my_news[i,2]
doc_time <- html(doc_url) %>% html_nodes(".reporter") %>% html_nodes("time") %>% html_text()
date_value <- paste(substr(doc_time,1,4),substr(doc_time,6,7),substr(doc_time,9,10),sep="")
date_value <- paste(date_value, substr(doc_time,13,14), substr(doc_time,16,17),sep="")
filename <- paste("chinatimes/", date_value, "_", my_news[i,1], sep="")
filename## [1] "chinatimes/201611141352_喜康生技與亞獅康合作生產ASLAN004"
doc_temp <- html(doc_url) %>% html_nodes(".clear-fix")%>% html_nodes("p") %>% html_text()
lastnum <- length(doc_temp)-7
x<-paste(doc_temp[1:lastnum], collapse = '', sep="")
x## [1] "喜康生技宣佈與專注於亞洲盛行之癌症,開發免疫療法與標靶抗癌藥物之生物科技公司 – 亞獅康,締結製造服務合作夥伴關係,喜康將為亞獅康ASLAN004之後續開發,提供製程發展服務。ASLAN004為阻斷IL-4 與 IL-13訊號傳導之全人源單株抗體,亞獅康與喜康雙方團隊將密切合作,為ASLAN004臨床試驗用藥之生產進行分析與製程開發,並由已取得GMP認證之喜康竹北廠負責,自2016年展開ASLAN004的開發及製造。亞獅康營運長Mark McHale指出,喜康製程開發與生產製造能力將加速公司特有生物製劑產品的開發。亞獅康專注於開發亞洲盛行癌症的全新療法,與喜康生技的合作將能擴展公司與亞洲區域重要夥伴策略合作關係。"
# write.table(x[1], paste(filename, ".txt",sep="") , quote = FALSE, row.names=FALSE, col.names = FALSE)
# Sys.sleep(runif(1,0,5))library(rvest)
library(XML)
library(magrittr)
alldat <- matrix( ,ncol=11,nrow=0)
for (tyear in 2010:2010){
for (pid in 1:1){
stockURL <- paste("http://www.boxofficemojo.com/yearly/chart/?page=",pid,"&view=releasedate&view2=domestic&yr=",tyear,"&p=.htm",sep="")
sdat <- html(stockURL) %>%
html_nodes("table") %>%
extract2(4) %>%
html_nodes("a") %>%
html_attr("href")
temp1 <- sdat[seq(18,315,by=3)] # 不同年份会不一样
sdat1 <- html(stockURL) %>%
html_nodes("table") %>%
extract2(4) %>%
html_nodes("td") %>%
html_text()
temp2 <- matrix(sdat1[15:914],nrow=100,byrow = TRUE)
alltemp <- cbind(rep(tyear,100),temp2,temp1)
alldat <- rbind(alldat,alltemp)
}
}
alldat1 <- data.frame(alldat)
colnames(alldat1) <- c("myear","mrank","mtitle","mstudio","mgross","mtheaters","mopen","motheaters","modate","mcdate","mhref")
head(alldat1)## myear mrank mtitle mstudio
## 1 2010 1 Toy Story 3 BV
## 2 2010 2 Alice in Wonderland (2010) BV
## 3 2010 3 Iron Man 2 Par.
## 4 2010 4 The Twilight Saga: Eclipse Sum.
## 5 2010 5 Harry Potter and the Deathly Hallows Part 1 WB
## 6 2010 6 Inception WB
## mgross mtheaters mopen motheaters modate mcdate
## 1 $415,004,880 4,028 $110,307,189 4,028 6/18 12/2
## 2 $334,191,110 3,739 $116,101,023 3,728 3/5 7/8
## 3 $312,433,331 4,390 $128,122,480 4,380 5/7 8/19
## 4 $300,531,751 4,468 $64,832,191 4,468 6/30 10/21
## 5 $295,983,305 4,125 $125,017,372 4,125 11/19 4/7
## 6 $292,576,195 3,792 $62,785,337 3,792 7/16 1/6
## mhref
## 1 /movies/?id=toystory3.htm
## 2 /movies/?id=aliceinwonderland10.htm
## 3 /movies/?id=ironman2.htm
## 4 /movies/?id=eclipse.htm
## 5 /movies/?id=harrypotter7.htm
## 6 /movies/?id=inception.htm
#write.csv(alldat1,"movtable2010.csv",row.names = FALSE)library(rvest)
purl <- "https://www.ptt.cc/bbs/Gossiping/index.html"
pg <- read_html(purl) %>%
html_node("body") %>%
html_nodes(".title") %>%
html_text(trim=TRUE)
print(pg)## character(0)
library(httr)
library(rvest)
session1 <- html_session(url = "https://www.ptt.cc/bbs/Gossiping/index.html")
form1 <- session1 %>%
html_node("form") %>%
html_form()
session_redirected <- submit_form(session = session1, form = form1 )
# Access cookies in a response
cookies(session_redirected)## domain flag path secure expiration name
## 1 #HttpOnly_.ptt.cc TRUE / FALSE 2017-11-14 14:09:54 __cfduid
## 2 www.ptt.cc FALSE / FALSE <NA> over18
## value
## 1 d25f0f633b9baa27e0385a8907dd7d5821479103794
## 2 1
session_redirected %>%
html_node("body") %>%
html_nodes(".title") %>%
html_text(trim=TRUE) ## [1] "[新聞] 拍到了!石虎萌樣入鏡超清楚"
## [2] "[問卦] 隨油徵收的訴求"
## [3] "[轉錄] 你的名字只是日本版海角七號吧"
## [4] "[問卦] 有木有壹電視的八卦?"
## [5] "Re: [問卦] 如果台北市禁止自私人小客車會怎樣?"
## [6] "[問卦] 徵人啟事寫禁肥宅有違法嗎?"
## [7] "[新聞] 優先洽購什麼?國防部:神盾艦、反潛直"
## [8] "Re: [新聞] 一例一休爭議造成痛苦 郭芳煜:感同"
## [9] "[新聞] Uber違法爭議 交部:行文要求APP下架"
## [10] "[問卦] 天衣無縫"
## [11] "Re: [新聞] 打臉柯P 機車族:捷運搭到飽貴3倍"
## [12] "[問卦] 有沒有臺灣最適當好用白天問候語的八卦?"
## [13] "Re: [新聞] 打臉柯P 機車族:捷運搭到飽貴3倍"
## [14] "[問卦] 有沒有髮夾彎轉得很痛苦的八卦?"
## [15] "[問卦] 為什麼英國可以一直培養出好的音樂家?"
## [16] "[問卦] 女生是不是不愛逛八卦板?"
## [17] "Re: [新聞] 捷運吃到飽2500元太貴 柯P:革命分階段"
## [18] "[新聞] 酒駕撞爛保時捷 醉男嗆警「開槍啊!」"
## [19] "Re: [問卦] 女生是不是不愛逛八卦板?"
## [20] "[公告] 八卦板板規(2016.08.16)"
## [21] "Fw: [徵求] 11/11 15:20-15:40 永康中華路行車紀錄器"
## [22] "[尋求] 11/09早上八點麥寮六輕行車記錄器"
## [23] "[公告] 同志議題大怒灌11/12~14"
## [24] "[公告] 十一月份置底閒聊區"
library(rvest)
url <- "http://58921.com/user/login?redirect=%2Fboxoffice%2Fhistory%2F2882"
session <- html_session(url)
form1 <- html_form(session)[[1]]
filled_form <- set_values(form1,
`mail` = "ylchiu@126.com",
`pass` = "xxxxxx")
movieURL <- "http://58921.com/alltime/2015?page=0"
mdat <- submit_form(session, filled_form) %>%
jump_to(movieURL) %>%
html_nodes("table td") %>%
html_text(trim=TRUE)
temp <- matrix(mdat, ncol=8, byrow=TRUE)
for (i in 1:3){
movieURL <- paste("http://58921.com/alltime/2015?page=", i, sep="")
mdat <- submit_form(session, filled_form) %>%
jump_to(movieURL) %>%
html_nodes("table td") %>%
html_text(trim=TRUE)
temp <- rbind(temp, matrix(mdat, ncol=8, byrow=TRUE))
}var url ='http://www.gewara.com/movie/65381346';
var page = new WebPage()
var fs = require('fs');
page.open(url, function (status) {
just_wait();
});
function just_wait() {
setTimeout(function() {
fs.write('1.html', page.content, 'w');
phantom.exit();
}, 2500);
}library(rvest)
library(XML)
library(magrittr)
allcid <- matrix(,nrow=0,ncol=1)
for (i in 1:1){
url <- paste0("http://movie.mtime.com/movie/search/section/#pageIndex=",i,"&year=2009")
lines <- readLines("hello.js")
lines[1] <- paste0("var url ='", url ,"';")
writeLines(lines, "hello.js")
## Download website
system("phantomjs hello.js")
pg <- read_html("1.html")
cid <- pg %>% html_nodes("a") %>% html_attr("href")
cid <- grep("http\\:\\/\\/movie\\.mtime\\.com\\/[0-9]",cid, value = TRUE)
cid <- cid[!duplicated(cid)]
cid <- cid[1:20]
allcid <- c(allcid, cid)
Sys.sleep(runif(1,1,5))
}
allcid <- allcid[!duplicated(allcid)]