library(xml2)
library(XML)
library(dplyr)
library(RSelenium)
load("data/rawdata.rdata")
# load("data/creatorInfo.rdata")
# load("data/projectInfo.rdata")
# load("data/updateInfo.rdata")
# load("data/supportInfo.rdata")
creator_info = Reduce(rbind, Map(function(x){
# x = creator_id[14]
url = paste0("https://www.kickstarter.com/profile/", x, "/about")
# url = "https://www.kickstarter.com/profile/795368464/about"
doc = read_html(url)
## biography
xpath = '//*[(@id = "content")]//p'
bio = paste0("",xml_text(xml_find_all(doc, xpath)))
bio = gsub("\n", "", bio[2]) # 去除換行及前後空白
bio = gsub("^\\s|\\s$", "", bio)
if(is.na(bio)) bio=NA
## websites
xpath = '//*[(@id = "content")]//a'
websites = paste0("",xml_attr(xml_find_all(doc, xpath), "href"))
websites = gsub("\n", "", websites) # 去除換行及前後空白
websites = gsub("^\\s|\\s$", "", websites)
webs = ""
webs = paste(websites, collapse = ', ') # 把所有網址連接在一起
## backed project count
xpath = '//span[@class="backed"]'
backed = xml_text(xml_find_all(doc, xpath)) %>% regmatches(., regexpr("[[:digit:]]+", .)) %>% trimws # [[:digit:]]+ keeps multi-digit counts whole
if(length(backed)==0) backed=NA
## created project count
xpath = '//span[@class="count"]'
created = xml_text(xml_find_all(doc, xpath)[1]) %>% regmatches(., regexpr("[[:digit:]]+", .)) %>% trimws
if(length(created)==0) created=NA
# collect into a data frame
df = data.frame(cid = x,
biography = bio,
websites = webs,
backed = backed,
created = created,
stringsAsFactors = FALSE)
}, creator_id))
# View(creator_info)
save(creator_info, file="data/creatorInfo.rdata")
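# Optional sanity check (a sketch, not part of the original pipeline):
# count missing values per column of the scraped creator table.
# sapply(creator_info, function(col) sum(is.na(col)))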
"done"
iter = 0
# scrape each project page: update/comment counts, main image, description and pledge-tier count
project_info = Reduce(rbind, Map(function(x){
# x="https://www.kickstarter.com/projects/1870476470/tomb-of-horror-vol-2-horror-anthology?ref=category_newest"
iter <<- iter + 1 # progress counter; <<- updates the counter defined above
doc = read_html(x)
## updates count
xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "project-nav__link--updates", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "count", " " ))]'
upd_cnt = xml_text(xml_find_all(doc, xpath)) %>% as.numeric
if(length(upd_cnt)==0) upd_cnt = NA # no updates count found
## comments count
xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "project-nav__link--comments", " " ))]'
cmt_cnt = xml_find_all(doc, xpath) %>%
xml_text %>%
regmatches(., regexpr("[[:digit:]]+", .)) %>%
trimws %>%
as.numeric
if(length(cmt_cnt)==0) cmt_cnt = NA # no comments count found
## project image
xpath='//img[contains(concat( " ", @class, " " ), concat( " ", "aspect-ratio--object", " " ))]'
img_src = doc %>% xml_find_all(xpath) %>% xml_attr("src")
if(length(img_src)==0) img_src=NA
## Description
xpath='//*[contains(concat( " ", @class, " " ), concat( " ", "description-container", " " ))]'
desc = doc %>%
xml_find_all(xpath) %>%
xml_text %>%
sub("\\sLearn about accountability on Kickstarter","",.) %>% # 去除最下面的贅字
sub("\\sQuestions about this project\\?","",.) %>%
sub("Check out the FAQ","",.) %>%
sub("\\sReport this project to Kickstarter","",.) %>%
trimws
### Risks and challenges
desc_risksAndChallenges = desc %>% regmatches(., regexpr("Risks and challenges[[:space:]].*", .)) %>%
sub("Risks and challenges","",.) %>% trimws
if(length(desc_risksAndChallenges)==0) desc_risksAndChallenges=NA
### About
desc_about = desc %>%
regmatches(., regexpr("About[[:space:]].*", .)) %>%
sub("About","",.) %>% trimws
# strip the "Risks and challenges" part from the About text, if present
desc_about = desc_about %>% sub("Risks and challenges[[:space:]].*", "", .) %>% trimws
if(length(desc_about)==0) desc_about = NA
### pictures in description //example[124] or [126]
pic_src = doc %>% xml_find_all(xpath) %>% xml_find_all('.//img[@class="fit"]') %>% xml_attr("src")
pic_srcs = paste(pic_src, collapse = ', ') # join all image URLs into one string
pic_cnt = length(pic_src) %>% as.numeric
if(pic_cnt==0) pic_srcs=NA
## support
xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "pledge-selectable-sidebar", " " ))]'
supports = doc %>% xml_find_all(xpath)
sup_cnt = length(supports) %>% as.numeric
## Hidden Project
Erroresult <- tryCatch(
{
xpath = '//*[(@id = "hidden_project")]'
isHidden = xml_find_all(doc, xpath)
if(length(isHidden)!=0){
upd_cnt = NA
cmt_cnt = NA
img_src = NA
desc = NA
desc_about = NA
desc_risksAndChallenges = NA
pic_cnt = NA
pic_srcs = NA
sup_cnt = NA
}
}, warning = function(war) {
print(paste("MY_WARNING: ",war))
}, error = function(err) {
print(paste("MY_ERROR: ",err))
})
# collect into a data frame
df = data.frame(
purl = x,
upd_cnt = upd_cnt,
cmt_cnt = cmt_cnt,
img_src = img_src,
desc = desc,
desc_about = desc_about,
desc_risksAndChallenges = desc_risksAndChallenges,
pic_cnt = pic_cnt,
pic_srcs = pic_srcs,
sup_cnt = sup_cnt,
stringsAsFactors = FALSE
)
}, project_url))
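# Optional check (a sketch): hidden projects end up with NA in every scraped
# column, so counting NA descriptions gives a rough tally of them.
# sum(is.na(project_info$desc))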
# View(project_info)
save(project_info, file="data/projectInfo.rdata")
"done"
# scrape each project's updates and write them to MongoDB via updDB (see the connection sketch above)
updates_db = sapply(project_url, function(x){
# print(x)
upd=data.frame()
# x = project_url[405]
xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "project-nav__link--updates", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "count", " " ))]'
doc = read_html(x)
upd_cnt = xml_text(xml_find_all(doc, xpath)) %>% as.numeric
if(length(upd_cnt)==0 || is.na(upd_cnt)) upd_cnt = 0 # treat a missing count as no updates
## Hidden Project
Erroresult <- tryCatch(
{
xpath = '//*[(@id = "hidden_project")]'
isHidden = xml_find_all(doc, xpath)
if(length(isHidden)!=0){
upd_date = NA
upd_title = NA
upd_content = NA
upd_cnt = 0
}
}, warning = function(war) {
print(paste("MY_WARNING: ",war))
}, error = function(err) {
print(paste("MY_ERROR hidden project: ",err))
})
if(upd_cnt!=0){
url = x %>% sub(".ref=.*","",.) %>% paste0("/updates")
doc = read_html(url)
xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "hover-target", " " ))]'
upPosts = xml_find_all(doc, xpath)
upPosts_date = upPosts %>%
xml_text %>%
regmatches(., regexpr("[[:upper:]][[:lower:]]* [[:digit:]]{1,2}, [[:digit:]]{4}", .)) %>%
trimws %>% as.character
# strptime("%B %d, %Y")
upPosts_title = upPosts %>%
xml_find_all('.//*[contains(concat( " ", @class, " " ), concat( " ", "grid-post__title", " " ))]') %>%
xml_text %>% trimws
upPosts_content = upPosts %>%
xml_find_all('.//div[@class = "grid-post__content"]') %>%
xml_text %>% trimws
if(length(upPosts_content)==0){
upPosts_content = NA
}
# some updates have only a title and no content; pad with empty strings
if(length(upPosts_content)!=length(upPosts_title)){
for(i in 1:(length(upPosts_title) - length(upPosts_content)) ){
upPosts_content = append(upPosts_content, "")
}
}
# collect into a data frame
upd = data.frame(
purl = x,
upd_date = upPosts_date,
upd_title = upPosts_title,
upd_content = upPosts_content,
stringsAsFactors = FALSE
)
# insert data to DB
updDB$insert(upd)
} # end if
})
# same scrape again, this time collecting all updates into a single data frame
updates = Reduce(rbind, lapply(project_url,function(x){
# print(x)
iter <<- iter + 1 # progress counter; <<- updates the counter in the enclosing environment
upd=data.frame()
# x = project_url[405]
xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "project-nav__link--updates", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "count", " " ))]'
doc = read_html(x)
upd_cnt = xml_text(xml_find_all(doc, xpath)) %>% as.numeric
if(length(upd_cnt)==0 || is.na(upd_cnt)) upd_cnt = 0 # treat a missing count as no updates
## Hidden Project
Erroresult <- tryCatch(
{
xpath = '//*[(@id = "hidden_project")]'
isHidden = xml_find_all(doc, xpath)
if(length(isHidden)!=0){
upd_date = NA
upd_title = NA
upd_content = NA
upd_cnt = 0
}
}, warning = function(war) {
print(paste("MY_WARNING: ",war))
}, error = function(err) {
print(paste("MY_ERROR hidden project: ",err))
})
if(upd_cnt!=0){
url = x %>% sub(".ref=.*","",.) %>% paste0("/updates")
doc = read_html(url)
xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "hover-target", " " ))]'
upPosts = xml_find_all(doc, xpath)
upPosts_date = upPosts %>%
xml_text %>%
regmatches(., regexpr("[[:upper:]][[:lower:]]* [[:digit:]]{1,2}, [[:digit:]]{4}", .)) %>%
trimws %>% as.character
# strptime("%B %d, %Y")
upPosts_title = upPosts %>%
xml_find_all('.//*[contains(concat( " ", @class, " " ), concat( " ", "grid-post__title", " " ))]') %>%
xml_text %>% trimws
upPosts_content = upPosts %>%
xml_find_all('.//div[@class = "grid-post__content"]') %>%
xml_text %>% trimws
if(length(upPosts_content)==0){
upPosts_content = NA
}
# some updates have only a title and no content; pad with empty strings
if(length(upPosts_content)!=length(upPosts_title)){
for(i in 1:(length(upPosts_title) - length(upPosts_content)) ){
upPosts_content = append(upPosts_content, "")
}
}
# collect into a data frame
upd = data.frame(
purl = x,
upd_date = upPosts_date,
upd_title = upPosts_title,
upd_content = upPosts_content,
stringsAsFactors = FALSE
)
} # end if
return(upd)
}))
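# upd_date is kept as text in the form matched above (e.g. "Month D, YYYY").
# A sketch of converting it to Date, assuming an English locale:
# updates$upd_date = as.Date(updates$upd_date, format = "%B %d, %Y")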
# View(updates)
save(updates, file="data/updateInfo.rdata")
"done"
# scrape each project's pledge tiers: amount, description, extra info and backer count
supports = Reduce(rbind, lapply(project_url,function(x){
# print(x)
sup = data.frame()
# x = project_url[1321]
doc = read_html(x)
xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "pledge-selectable-sidebar", " " ))]'
supports = doc %>% xml_find_all(xpath)
sup_cnt = length(supports) %>% as.numeric
## Hidden Project
Erroresult <- tryCatch(
{
xpath = '//*[(@id = "hidden_project")]'
isHidden = xml_find_all(doc, xpath)
if(length(isHidden)!=0){
sup_amounts = NA
sup_desc = NA
sup_extraInfo = NA
sup_baker = NA
sup_cnt=0
}
}, warning = function(war) {
print(paste("MY_WARNING: ",war))
}, error = function(err) {
print(paste("MY_ERROR hidden project: ",err))
})
if(sup_cnt!=0){
sup_amounts = supports %>%
xml_find_all('.//h2[@class="pledge__amount"]') %>%
xml_text %>% trimws
sup_desc = supports %>%
xml_find_all('.//div[@class="pledge__reward-description pledge__reward-description--expanded"]') %>%
xml_text %>% trimws
sup_extraInfo = supports %>%
xml_find_all('.//div[@class="pledge__extra-info"]') %>%
xml_text %>% trimws
sup_baker = supports %>% # number of backers per pledge tier
xml_find_all('.//*[contains(concat( " ", @class, " " ), concat( " ", "pledge__backer-count", " " ))]') %>%
xml_text %>% regmatches(., regexpr("[[:digit:]]+", .)) %>% as.numeric
# collect into a data frame
sup = data.frame(
purl = x,
sup_amounts = sup_amounts,
sup_desc = sup_desc,
sup_extraInfo = sup_extraInfo,
sup_baker = sup_baker,
stringsAsFactors = FALSE
)
} # end if
return(sup)
}))
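# sup_amounts keeps the raw pledge text; assuming it looks like "Pledge US$ 10
# or more", a sketch of extracting a numeric amount:
# supports$sup_amount_num = as.numeric(gsub("[^0-9.]", "", supports$sup_amounts))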
# View(supports)
save(supports, file="data/supportInfo.rdata")
"done"
# ra_bio = with(all_bio, readability(bio, url))
# ra_bio
# summary(ra_bio)