data source




Start to crawl

library(xml2)
library(XML)
library(dplyr)
library(RSelenium)

load("data/rawdata.rdata")

load

# load("data/creatorInfo.rdata")
# load("data/projectInfo.rdata")
# load("data/updateInfo.rdata")
# load("data/supportInfo.rdata")

Creator Information

# Scrape the public "about" page of every creator in `creator_id` and build one
# row per creator:
#   cid       - creator id as given in `creator_id`
#   biography - text of the second paragraph under #content (NA when absent)
#   websites  - every link href under #content, joined with ", "
#   backed    - number of backed projects, as a digit string (NA when absent)
#   created   - number of created projects, as a digit string (NA when absent)
# A page that cannot be downloaded raises a warning and yields an all-NA row
# instead of aborting the whole crawl.
creator_info <- do.call(rbind, unname(lapply(creator_id, function(x) {

  # First number in the first element of `txt`, returned as a digit string
  # with thousands separators removed; NA when nothing matches.
  first_count <- function(txt) {
    if (length(txt) == 0 || is.na(txt[1])) return(NA_character_)
    hit <- regmatches(txt[1], regexpr("[[:digit:]][[:digit:],]*", txt[1]))
    if (length(hit) == 0) return(NA_character_)
    gsub(",", "", hit, fixed = TRUE)
  }

  url <- paste0("https://www.kickstarter.com/profile/", x, "/about")
  doc <- tryCatch(read_html(url), error = function(err) {
    warning("Could not read ", url, ": ", conditionMessage(err), call. = FALSE)
    NULL
  })
  if (is.null(doc)) {
    return(data.frame(cid = x,
                      biography = NA_character_,
                      websites = NA_character_,
                      backed = NA_character_,
                      created = NA_character_))
  }

  ## biography: second paragraph under #content, newlines and outer blanks removed
  bio <- xml_text(xml_find_all(doc, '//*[(@id = "content")]//p'))[2]
  bio <- trimws(gsub("\n", "", bio))

  ## websites: anchors without an href are skipped rather than stored as "NA"
  hrefs <- xml_attr(xml_find_all(doc, '//*[(@id = "content")]//a'), "href")
  hrefs <- trimws(gsub("\n", "", hrefs[!is.na(hrefs)]))
  webs <- paste(hrefs, collapse = ", ")

  ## backed / created project counts (all digits, not just the first one)
  backed <- first_count(xml_text(xml_find_all(doc, '//span[@class="backed"]')))
  created <- first_count(xml_text(xml_find_all(doc, '//span[@class="count"]')))

  data.frame(cid = x,
             biography = bio,
             websites = webs,
             backed = backed,
             created = created)
})))

# View(creator_info)

save(creator_info, file="data/creatorInfo.rdata")
"done"

project information

# Counter left at top level because other code may read it; the per-project
# function below no longer shadows it with a dead local copy.
iter <- 0

# Scrape the landing page of every project in `project_url` and build exactly
# one row per project:
#   purl                    - project URL as given
#   upd_cnt, cmt_cnt        - update / comment counts shown in the project nav
#   img_src                 - src of the first main project image
#   desc                    - description text with the footer boilerplate removed
#   desc_about              - "About" section of desc, without the risks section
#   desc_risksAndChallenges - "Risks and challenges" section of desc
#   pic_cnt, pic_srcs       - number / ", "-joined srcs of img.fit inside the description
#   sup_cnt                 - number of reward (pledge) entries
# Hidden projects and pages that cannot be downloaded give a row whose fields
# are all NA; anything missing on a visible page is NA as well.
project_info <- do.call(rbind, unname(lapply(project_url, function(x) {

  # XPath predicate matching one token of a class attribute.
  has_class <- function(name) {
    paste0('contains(concat(" ", @class, " "), " ', name, ' ")')
  }

  # First number in the first element of `txt` (thousands separators removed);
  # NA when nothing matches.
  first_count <- function(txt) {
    if (length(txt) == 0 || is.na(txt[1])) return(NA_real_)
    hit <- regmatches(txt[1], regexpr("[[:digit:]][[:digit:],]*", txt[1]))
    if (length(hit) == 0) return(NA_real_)
    as.numeric(gsub(",", "", hit, fixed = TRUE))
  }

  na_row <- data.frame(
    purl = x, upd_cnt = NA_real_, cmt_cnt = NA_real_, img_src = NA_character_,
    desc = NA_character_, desc_about = NA_character_,
    desc_risksAndChallenges = NA_character_,
    pic_cnt = NA_real_, pic_srcs = NA_character_, sup_cnt = NA_real_
  )

  doc <- tryCatch(read_html(x), error = function(err) {
    warning("Could not read ", x, ": ", conditionMessage(err), call. = FALSE)
    NULL
  })
  if (is.null(doc)) return(na_row)

  ## Hidden project: nothing else on the page is usable
  if (length(xml_find_all(doc, '//*[(@id = "hidden_project")]')) != 0) {
    return(na_row)
  }

  ## updates / comments count
  upd_cnt <- first_count(xml_text(xml_find_all(
    doc,
    paste0("//*[", has_class("project-nav__link--updates"), "]//*[", has_class("count"), "]")
  )))
  cmt_cnt <- first_count(xml_text(xml_find_all(
    doc, paste0("//*[", has_class("project-nav__link--comments"), "]")
  )))

  ## project image
  img_src <- xml_attr(
    xml_find_all(doc, paste0("//img[", has_class("aspect-ratio--object"), "]")),
    "src"
  )
  img_src <- if (length(img_src) == 0) NA_character_ else img_src[1]

  ## Description
  desc_nodes <- xml_find_all(doc, paste0("//*[", has_class("description-container"), "]"))
  desc <- NA_character_
  desc_about <- NA_character_
  desc_risksAndChallenges <- NA_character_
  if (length(desc_nodes) > 0) {
    desc <- paste(xml_text(desc_nodes), collapse = "\n")
    # strip the boilerplate links at the bottom of the description
    footer <- c("\\sLearn about accountability on Kickstarter",
                "\\sQuestions about this project\\?",
                "Check out the FAQ",
                "\\sReport this project to Kickstarter")
    for (junk in footer) desc <- sub(junk, "", desc)
    desc <- trimws(desc)

    ### Risks and challenges
    risks_re <- "Risks and challenges[[:space:]].*"
    risks_hit <- regmatches(desc, regexpr(risks_re, desc))
    if (length(risks_hit) > 0) {
      desc_risksAndChallenges <- trimws(sub("Risks and challenges", "", risks_hit))
    }

    ### About: cut the risks section with the section regex itself; using the
    ### matched text as a pattern breaks on regex metacharacters
    about_hit <- regmatches(desc, regexpr("About[[:space:]].*", desc))
    if (length(about_hit) > 0) {
      desc_about <- trimws(sub(risks_re, "", sub("About", "", about_hit)))
    }
  }

  ### pictures in description: ".//" keeps the search inside the description
  ### nodes ("//" would scan the whole document)
  pic_src <- xml_attr(xml_find_all(desc_nodes, './/img[@class="fit"]'), "src")
  pic_cnt <- as.numeric(length(pic_src))
  pic_srcs <- if (pic_cnt == 0) NA_character_ else paste(pic_src, collapse = ", ")

  ## support
  sup_cnt <- as.numeric(length(xml_find_all(
    doc, paste0("//*[", has_class("pledge-selectable-sidebar"), "]")
  )))

  data.frame(
    purl = x,
    upd_cnt = upd_cnt,
    cmt_cnt = cmt_cnt,
    img_src = img_src,
    desc = desc,
    desc_about = desc_about,
    desc_risksAndChallenges = desc_risksAndChallenges,
    pic_cnt = pic_cnt,
    pic_srcs = pic_srcs,
    sup_cnt = sup_cnt
  )
})))

# View(project_info)

save(project_info, file="data/projectInfo.rdata")
"done"

update posts

# For every project in `project_url`, scrape its update posts and insert them
# into the `updDB` collection handle (which must exist before this runs), one
# document per post: purl, upd_date, upd_title, upd_content.
# Projects that are hidden, cannot be downloaded or show no updates are
# skipped. Each field is read from inside its own post node so date, title and
# content always line up; a post without a content block gets "".
upates <- sapply(project_url, function(x) {

  # XPath predicate matching one token of a class attribute.
  has_class <- function(name) {
    paste0('contains(concat(" ", @class, " "), " ', name, ' ")')
  }
  # Download a page; warn and return NULL instead of stopping the crawl.
  fetch <- function(u) {
    tryCatch(read_html(u), error = function(err) {
      warning("Could not read ", u, ": ", conditionMessage(err), call. = FALSE)
      NULL
    })
  }

  doc <- fetch(x)
  if (is.null(doc)) return(invisible(NULL))

  ## Hidden Project
  if (length(xml_find_all(doc, '//*[(@id = "hidden_project")]')) != 0) {
    return(invisible(NULL))
  }

  # Updates counter from the project nav; skip the second request when it is
  # missing, unreadable or zero.
  cnt_txt <- xml_text(xml_find_all(
    doc,
    paste0("//*[", has_class("project-nav__link--updates"), "]//*[", has_class("count"), "]")
  ))
  upd_cnt <- suppressWarnings(as.numeric(gsub(",", "", cnt_txt[1], fixed = TRUE)))
  if (length(cnt_txt) == 0 || is.na(upd_cnt) || upd_cnt == 0) {
    return(invisible(NULL))
  }

  # drop the query string (e.g. "?ref=...") before appending the sub-page
  doc <- fetch(paste0(sub("[?#].*$", "", x), "/updates"))
  if (is.null(doc)) return(invisible(NULL))

  posts <- xml_find_all(doc, paste0("//*[", has_class("hover-target"), "]"))
  if (length(posts) == 0) return(invisible(NULL))

  # date: first "Month d, yyyy" inside each post, NA when a post has none
  post_txt <- xml_text(posts)
  date_hit <- regexpr("[[:upper:]][[:lower:]]* [[:digit:]]{1,2}, [[:digit:]]{4}", post_txt)
  found <- !is.na(date_hit) & date_hit > 0
  upd_date <- rep(NA_character_, length(posts))
  upd_date[found] <- trimws(regmatches(post_txt, date_hit))

  # xml_find_first returns one (possibly missing) node per post
  upd_title <- trimws(xml_text(xml_find_first(
    posts, paste0(".//*[", has_class("grid-post__title"), "]")
  )))
  upd_content <- trimws(xml_text(xml_find_first(
    posts, './/div[@class = "grid-post__content"]'
  )))
  upd_content[is.na(upd_content)] <- ""   # some updates only have a title

  # matches without a title are not update posts
  keep <- !is.na(upd_title)
  if (!any(keep)) return(invisible(NULL))

  upd <- data.frame(
    purl = x,
    upd_date = upd_date[keep],
    upd_title = upd_title[keep],
    upd_content = upd_content[keep]
  )

  # insert data to DB
  updDB$insert(upd)
})
# Collect the update posts of every project in `project_url`, one row per post
# (purl, upd_date, upd_title, upd_content), and bind them into `updates`.
# Projects that are hidden, cannot be downloaded or show no updates contribute
# no rows. Each field is read from inside its own post node so date, title and
# content always line up; a post without a content block gets "".
updates <- do.call(rbind, unname(lapply(project_url, function(x) {

  upd <- data.frame()

  # XPath predicate matching one token of a class attribute.
  has_class <- function(name) {
    paste0('contains(concat(" ", @class, " "), " ', name, ' ")')
  }
  # Download a page; warn and return NULL instead of stopping the crawl.
  fetch <- function(u) {
    tryCatch(read_html(u), error = function(err) {
      warning("Could not read ", u, ": ", conditionMessage(err), call. = FALSE)
      NULL
    })
  }

  doc <- fetch(x)
  if (is.null(doc)) return(upd)

  ## Hidden Project
  if (length(xml_find_all(doc, '//*[(@id = "hidden_project")]')) != 0) {
    return(upd)
  }

  # Updates counter from the project nav; skip the second request when it is
  # missing, unreadable or zero.
  cnt_txt <- xml_text(xml_find_all(
    doc,
    paste0("//*[", has_class("project-nav__link--updates"), "]//*[", has_class("count"), "]")
  ))
  upd_cnt <- suppressWarnings(as.numeric(gsub(",", "", cnt_txt[1], fixed = TRUE)))
  if (length(cnt_txt) == 0 || is.na(upd_cnt) || upd_cnt == 0) return(upd)

  # drop the query string (e.g. "?ref=...") before appending the sub-page
  doc <- fetch(paste0(sub("[?#].*$", "", x), "/updates"))
  if (is.null(doc)) return(upd)

  posts <- xml_find_all(doc, paste0("//*[", has_class("hover-target"), "]"))
  if (length(posts) == 0) return(upd)

  # date: first "Month d, yyyy" inside each post, NA when a post has none
  post_txt <- xml_text(posts)
  date_hit <- regexpr("[[:upper:]][[:lower:]]* [[:digit:]]{1,2}, [[:digit:]]{4}", post_txt)
  found <- !is.na(date_hit) & date_hit > 0
  upd_date <- rep(NA_character_, length(posts))
  upd_date[found] <- trimws(regmatches(post_txt, date_hit))

  # xml_find_first returns one (possibly missing) node per post
  upd_title <- trimws(xml_text(xml_find_first(
    posts, paste0(".//*[", has_class("grid-post__title"), "]")
  )))
  upd_content <- trimws(xml_text(xml_find_first(
    posts, './/div[@class = "grid-post__content"]'
  )))
  upd_content[is.na(upd_content)] <- ""   # some updates only have a title

  # matches without a title are not update posts
  keep <- !is.na(upd_title)
  if (!any(keep)) return(upd)

  data.frame(
    purl = x,
    upd_date = upd_date[keep],
    upd_title = upd_title[keep],
    upd_content = upd_content[keep]
  )
})))

# View(updates)
save(updates, file="data/updateInfo.rdata")
"done"

supports

# Collect the reward tiers of every project in `project_url`, one row per
# reward: purl, sup_amounts, sup_desc, sup_extraInfo, sup_baker (backer count).
# Projects that are hidden, cannot be downloaded or list no rewards contribute
# no rows. Every field is looked up inside its own reward node, so a reward
# lacking one of the fields gets NA there instead of shifting the other rows.
supports <- do.call(rbind, unname(lapply(project_url, function(x) {

  sup <- data.frame()

  # XPath predicate matching one token of a class attribute.
  has_class <- function(name) {
    paste0('contains(concat(" ", @class, " "), " ', name, ' ")')
  }

  doc <- tryCatch(read_html(x), error = function(err) {
    warning("Could not read ", x, ": ", conditionMessage(err), call. = FALSE)
    NULL
  })
  if (is.null(doc)) return(sup)

  ## Hidden Project
  if (length(xml_find_all(doc, '//*[(@id = "hidden_project")]')) != 0) {
    return(sup)
  }

  rewards <- xml_find_all(doc, paste0("//*[", has_class("pledge-selectable-sidebar"), "]"))
  if (length(rewards) == 0) return(sup)

  # Trimmed text of the first match of `xp` inside each reward (NA if none).
  reward_text <- function(xp) trimws(xml_text(xml_find_first(rewards, xp)))

  sup_amounts <- reward_text('.//h2[@class="pledge__amount"]')
  sup_desc <- reward_text(
    './/div[@class="pledge__reward-description pledge__reward-description--expanded"]'
  )
  sup_extraInfo <- reward_text('.//div[@class="pledge__extra-info"]')

  # backer count: first number in the text, thousands separators removed
  backer_txt <- reward_text(paste0(".//*[", has_class("pledge__backer-count"), "]"))
  backer_hit <- regexpr("[[:digit:]][[:digit:],]*", backer_txt)
  found <- !is.na(backer_hit) & backer_hit > 0
  sup_baker <- rep(NA_real_, length(rewards))
  sup_baker[found] <- as.numeric(
    gsub(",", "", regmatches(backer_txt, backer_hit), fixed = TRUE)
  )

  data.frame(
    purl = x,
    sup_amounts = sup_amounts,
    sup_desc = sup_desc,
    sup_extraInfo = sup_extraInfo,
    sup_baker = sup_baker
  )
})))

# View(supports)
save(supports, file="data/supportInfo.rdata")
"done"



creator biography可讀性

# ra_bio = with(all_bio, readability(bio, url))
# ra_bio
# summary(ra_bio)
---
title: "R Notebook"
output: html_notebook
---

<br>
[data source](https://webrobots.io/kickstarter-datasets/) 

---
<br> <br> <hr>
## connect to MongoDB
```{r}
# install.packages("Rtools")
# install.packages("devtools")
# library(devtools)
# install.packages("rJava")
# Sys.setenv(JAVA_HOME="C:/Program Files/Java/jdk-11.0.1/")
# library(rJava)
# install.packages("RMongo")
```

```{r}
# Connect to the local "Kickstarter" database through RMongo and try a
# round trip: list collections, query, insert one document, query again.
library(RMongo)
mongo = mongoDbConnect("Kickstarter")
dbShowCollections(mongo)
# NOTE(review): this query string is empty while the later call passes '{}';
# confirm which form RMongo expects. `query` is not used again in this chunk.
query <- dbGetQuery(mongo, 'rawdata', "")

# Convert the first row of creator_info to JSON so it can be stored
library(rjson)
crt_json = toJSON(creator_info[1,])

# insert data in mongo
output <- dbInsertDocument(mongo, "test_data", crt_json)  
# `output` is overwritten here, so only the query result is printed below
output <- dbGetQuery(mongo, 'rawdata', '{}')
output


# dbDisconnect(mongo)
```

mongolite
```{r}
library("mongolite")
# Handle to the "rawdata" collection of the "Kickstarter" database
# (the connection is created here; database and collection are named above).
rawdataDB = mongo(collection = "rawdata", db = "Kickstarter")
# count is a method: without the parentheses the function object is printed
# instead of the number of documents.
rawdataDB$count()
```


---
<br> <br> <hr>
## Start to crawl
```{r message=FALSE}
library(xml2)
library(XML)
library(dplyr)
library(RSelenium)

load("data/rawdata.rdata")
```


#### load
```{r}
# load("data/creatorInfo.rdata")
# load("data/projectInfo.rdata")
# load("data/updateInfo.rdata")
# load("data/supportInfo.rdata")
```

### Creator Information

```{r}
# Scrape the public "about" page of every creator in `creator_id` and build one
# row per creator:
#   cid       - creator id as given in `creator_id`
#   biography - text of the second paragraph under #content (NA when absent)
#   websites  - every link href under #content, joined with ", "
#   backed    - number of backed projects, as a digit string (NA when absent)
#   created   - number of created projects, as a digit string (NA when absent)
# A page that cannot be downloaded raises a warning and yields an all-NA row
# instead of aborting the whole crawl.
creator_info <- do.call(rbind, unname(lapply(creator_id, function(x) {

  # First number in the first element of `txt`, returned as a digit string
  # with thousands separators removed; NA when nothing matches.
  first_count <- function(txt) {
    if (length(txt) == 0 || is.na(txt[1])) return(NA_character_)
    hit <- regmatches(txt[1], regexpr("[[:digit:]][[:digit:],]*", txt[1]))
    if (length(hit) == 0) return(NA_character_)
    gsub(",", "", hit, fixed = TRUE)
  }

  url <- paste0("https://www.kickstarter.com/profile/", x, "/about")
  doc <- tryCatch(read_html(url), error = function(err) {
    warning("Could not read ", url, ": ", conditionMessage(err), call. = FALSE)
    NULL
  })
  if (is.null(doc)) {
    return(data.frame(cid = x,
                      biography = NA_character_,
                      websites = NA_character_,
                      backed = NA_character_,
                      created = NA_character_))
  }

  ## biography: second paragraph under #content, newlines and outer blanks removed
  bio <- xml_text(xml_find_all(doc, '//*[(@id = "content")]//p'))[2]
  bio <- trimws(gsub("\n", "", bio))

  ## websites: anchors without an href are skipped rather than stored as "NA"
  hrefs <- xml_attr(xml_find_all(doc, '//*[(@id = "content")]//a'), "href")
  hrefs <- trimws(gsub("\n", "", hrefs[!is.na(hrefs)]))
  webs <- paste(hrefs, collapse = ", ")

  ## backed / created project counts (all digits, not just the first one)
  backed <- first_count(xml_text(xml_find_all(doc, '//span[@class="backed"]')))
  created <- first_count(xml_text(xml_find_all(doc, '//span[@class="count"]')))

  data.frame(cid = x,
             biography = bio,
             websites = webs,
             backed = backed,
             created = created)
})))

# View(creator_info)

save(creator_info, file="data/creatorInfo.rdata")
"done"
```


### project information
```{r}
# Counter left at top level because other code may read it; the per-project
# function below no longer shadows it with a dead local copy.
iter <- 0

# Scrape the landing page of every project in `project_url` and build exactly
# one row per project:
#   purl                    - project URL as given
#   upd_cnt, cmt_cnt        - update / comment counts shown in the project nav
#   img_src                 - src of the first main project image
#   desc                    - description text with the footer boilerplate removed
#   desc_about              - "About" section of desc, without the risks section
#   desc_risksAndChallenges - "Risks and challenges" section of desc
#   pic_cnt, pic_srcs       - number / ", "-joined srcs of img.fit inside the description
#   sup_cnt                 - number of reward (pledge) entries
# Hidden projects and pages that cannot be downloaded give a row whose fields
# are all NA; anything missing on a visible page is NA as well.
project_info <- do.call(rbind, unname(lapply(project_url, function(x) {

  # XPath predicate matching one token of a class attribute.
  has_class <- function(name) {
    paste0('contains(concat(" ", @class, " "), " ', name, ' ")')
  }

  # First number in the first element of `txt` (thousands separators removed);
  # NA when nothing matches.
  first_count <- function(txt) {
    if (length(txt) == 0 || is.na(txt[1])) return(NA_real_)
    hit <- regmatches(txt[1], regexpr("[[:digit:]][[:digit:],]*", txt[1]))
    if (length(hit) == 0) return(NA_real_)
    as.numeric(gsub(",", "", hit, fixed = TRUE))
  }

  na_row <- data.frame(
    purl = x, upd_cnt = NA_real_, cmt_cnt = NA_real_, img_src = NA_character_,
    desc = NA_character_, desc_about = NA_character_,
    desc_risksAndChallenges = NA_character_,
    pic_cnt = NA_real_, pic_srcs = NA_character_, sup_cnt = NA_real_
  )

  doc <- tryCatch(read_html(x), error = function(err) {
    warning("Could not read ", x, ": ", conditionMessage(err), call. = FALSE)
    NULL
  })
  if (is.null(doc)) return(na_row)

  ## Hidden project: nothing else on the page is usable
  if (length(xml_find_all(doc, '//*[(@id = "hidden_project")]')) != 0) {
    return(na_row)
  }

  ## updates / comments count
  upd_cnt <- first_count(xml_text(xml_find_all(
    doc,
    paste0("//*[", has_class("project-nav__link--updates"), "]//*[", has_class("count"), "]")
  )))
  cmt_cnt <- first_count(xml_text(xml_find_all(
    doc, paste0("//*[", has_class("project-nav__link--comments"), "]")
  )))

  ## project image
  img_src <- xml_attr(
    xml_find_all(doc, paste0("//img[", has_class("aspect-ratio--object"), "]")),
    "src"
  )
  img_src <- if (length(img_src) == 0) NA_character_ else img_src[1]

  ## Description
  desc_nodes <- xml_find_all(doc, paste0("//*[", has_class("description-container"), "]"))
  desc <- NA_character_
  desc_about <- NA_character_
  desc_risksAndChallenges <- NA_character_
  if (length(desc_nodes) > 0) {
    desc <- paste(xml_text(desc_nodes), collapse = "\n")
    # strip the boilerplate links at the bottom of the description
    footer <- c("\\sLearn about accountability on Kickstarter",
                "\\sQuestions about this project\\?",
                "Check out the FAQ",
                "\\sReport this project to Kickstarter")
    for (junk in footer) desc <- sub(junk, "", desc)
    desc <- trimws(desc)

    ### Risks and challenges
    risks_re <- "Risks and challenges[[:space:]].*"
    risks_hit <- regmatches(desc, regexpr(risks_re, desc))
    if (length(risks_hit) > 0) {
      desc_risksAndChallenges <- trimws(sub("Risks and challenges", "", risks_hit))
    }

    ### About: cut the risks section with the section regex itself; using the
    ### matched text as a pattern breaks on regex metacharacters
    about_hit <- regmatches(desc, regexpr("About[[:space:]].*", desc))
    if (length(about_hit) > 0) {
      desc_about <- trimws(sub(risks_re, "", sub("About", "", about_hit)))
    }
  }

  ### pictures in description: ".//" keeps the search inside the description
  ### nodes ("//" would scan the whole document)
  pic_src <- xml_attr(xml_find_all(desc_nodes, './/img[@class="fit"]'), "src")
  pic_cnt <- as.numeric(length(pic_src))
  pic_srcs <- if (pic_cnt == 0) NA_character_ else paste(pic_src, collapse = ", ")

  ## support
  sup_cnt <- as.numeric(length(xml_find_all(
    doc, paste0("//*[", has_class("pledge-selectable-sidebar"), "]")
  )))

  data.frame(
    purl = x,
    upd_cnt = upd_cnt,
    cmt_cnt = cmt_cnt,
    img_src = img_src,
    desc = desc,
    desc_about = desc_about,
    desc_risksAndChallenges = desc_risksAndChallenges,
    pic_cnt = pic_cnt,
    pic_srcs = pic_srcs,
    sup_cnt = sup_cnt
  )
})))

# View(project_info)

save(project_info, file="data/projectInfo.rdata")
"done"
```

#### update posts

```{r}
updDB = mongo(collection = "updates", db = "Kickstarter") # create connection, database and collection
updDB$count()
```

```{r}
# For every project in `project_url`, scrape its update posts and insert them
# into the `updDB` collection handle (which must exist before this runs), one
# document per post: purl, upd_date, upd_title, upd_content.
# Projects that are hidden, cannot be downloaded or show no updates are
# skipped. Each field is read from inside its own post node so date, title and
# content always line up; a post without a content block gets "".
upates <- sapply(project_url, function(x) {

  # XPath predicate matching one token of a class attribute.
  has_class <- function(name) {
    paste0('contains(concat(" ", @class, " "), " ', name, ' ")')
  }
  # Download a page; warn and return NULL instead of stopping the crawl.
  fetch <- function(u) {
    tryCatch(read_html(u), error = function(err) {
      warning("Could not read ", u, ": ", conditionMessage(err), call. = FALSE)
      NULL
    })
  }

  doc <- fetch(x)
  if (is.null(doc)) return(invisible(NULL))

  ## Hidden Project
  if (length(xml_find_all(doc, '//*[(@id = "hidden_project")]')) != 0) {
    return(invisible(NULL))
  }

  # Updates counter from the project nav; skip the second request when it is
  # missing, unreadable or zero.
  cnt_txt <- xml_text(xml_find_all(
    doc,
    paste0("//*[", has_class("project-nav__link--updates"), "]//*[", has_class("count"), "]")
  ))
  upd_cnt <- suppressWarnings(as.numeric(gsub(",", "", cnt_txt[1], fixed = TRUE)))
  if (length(cnt_txt) == 0 || is.na(upd_cnt) || upd_cnt == 0) {
    return(invisible(NULL))
  }

  # drop the query string (e.g. "?ref=...") before appending the sub-page
  doc <- fetch(paste0(sub("[?#].*$", "", x), "/updates"))
  if (is.null(doc)) return(invisible(NULL))

  posts <- xml_find_all(doc, paste0("//*[", has_class("hover-target"), "]"))
  if (length(posts) == 0) return(invisible(NULL))

  # date: first "Month d, yyyy" inside each post, NA when a post has none
  post_txt <- xml_text(posts)
  date_hit <- regexpr("[[:upper:]][[:lower:]]* [[:digit:]]{1,2}, [[:digit:]]{4}", post_txt)
  found <- !is.na(date_hit) & date_hit > 0
  upd_date <- rep(NA_character_, length(posts))
  upd_date[found] <- trimws(regmatches(post_txt, date_hit))

  # xml_find_first returns one (possibly missing) node per post
  upd_title <- trimws(xml_text(xml_find_first(
    posts, paste0(".//*[", has_class("grid-post__title"), "]")
  )))
  upd_content <- trimws(xml_text(xml_find_first(
    posts, './/div[@class = "grid-post__content"]'
  )))
  upd_content[is.na(upd_content)] <- ""   # some updates only have a title

  # matches without a title are not update posts
  keep <- !is.na(upd_title)
  if (!any(keep)) return(invisible(NULL))

  upd <- data.frame(
    purl = x,
    upd_date = upd_date[keep],
    upd_title = upd_title[keep],
    upd_content = upd_content[keep]
  )

  # insert data to DB
  updDB$insert(upd)
})
```


```{r}
# Collect the update posts of every project in `project_url`, one row per post
# (purl, upd_date, upd_title, upd_content), and bind them into `updates`.
# Projects that are hidden, cannot be downloaded or show no updates contribute
# no rows. Each field is read from inside its own post node so date, title and
# content always line up; a post without a content block gets "".
updates <- do.call(rbind, unname(lapply(project_url, function(x) {

  upd <- data.frame()

  # XPath predicate matching one token of a class attribute.
  has_class <- function(name) {
    paste0('contains(concat(" ", @class, " "), " ', name, ' ")')
  }
  # Download a page; warn and return NULL instead of stopping the crawl.
  fetch <- function(u) {
    tryCatch(read_html(u), error = function(err) {
      warning("Could not read ", u, ": ", conditionMessage(err), call. = FALSE)
      NULL
    })
  }

  doc <- fetch(x)
  if (is.null(doc)) return(upd)

  ## Hidden Project
  if (length(xml_find_all(doc, '//*[(@id = "hidden_project")]')) != 0) {
    return(upd)
  }

  # Updates counter from the project nav; skip the second request when it is
  # missing, unreadable or zero.
  cnt_txt <- xml_text(xml_find_all(
    doc,
    paste0("//*[", has_class("project-nav__link--updates"), "]//*[", has_class("count"), "]")
  ))
  upd_cnt <- suppressWarnings(as.numeric(gsub(",", "", cnt_txt[1], fixed = TRUE)))
  if (length(cnt_txt) == 0 || is.na(upd_cnt) || upd_cnt == 0) return(upd)

  # drop the query string (e.g. "?ref=...") before appending the sub-page
  doc <- fetch(paste0(sub("[?#].*$", "", x), "/updates"))
  if (is.null(doc)) return(upd)

  posts <- xml_find_all(doc, paste0("//*[", has_class("hover-target"), "]"))
  if (length(posts) == 0) return(upd)

  # date: first "Month d, yyyy" inside each post, NA when a post has none
  post_txt <- xml_text(posts)
  date_hit <- regexpr("[[:upper:]][[:lower:]]* [[:digit:]]{1,2}, [[:digit:]]{4}", post_txt)
  found <- !is.na(date_hit) & date_hit > 0
  upd_date <- rep(NA_character_, length(posts))
  upd_date[found] <- trimws(regmatches(post_txt, date_hit))

  # xml_find_first returns one (possibly missing) node per post
  upd_title <- trimws(xml_text(xml_find_first(
    posts, paste0(".//*[", has_class("grid-post__title"), "]")
  )))
  upd_content <- trimws(xml_text(xml_find_first(
    posts, './/div[@class = "grid-post__content"]'
  )))
  upd_content[is.na(upd_content)] <- ""   # some updates only have a title

  # matches without a title are not update posts
  keep <- !is.na(upd_title)
  if (!any(keep)) return(upd)

  data.frame(
    purl = x,
    upd_date = upd_date[keep],
    upd_title = upd_title[keep],
    upd_content = upd_content[keep]
  )
})))

# View(updates)
save(updates, file="data/updateInfo.rdata")
"done"
```


### supports

```{r}
# Collect the reward tiers of every project in `project_url`, one row per
# reward: purl, sup_amounts, sup_desc, sup_extraInfo, sup_baker (backer count).
# Projects that are hidden, cannot be downloaded or list no rewards contribute
# no rows. Every field is looked up inside its own reward node, so a reward
# lacking one of the fields gets NA there instead of shifting the other rows.
supports <- do.call(rbind, unname(lapply(project_url, function(x) {

  sup <- data.frame()

  # XPath predicate matching one token of a class attribute.
  has_class <- function(name) {
    paste0('contains(concat(" ", @class, " "), " ', name, ' ")')
  }

  doc <- tryCatch(read_html(x), error = function(err) {
    warning("Could not read ", x, ": ", conditionMessage(err), call. = FALSE)
    NULL
  })
  if (is.null(doc)) return(sup)

  ## Hidden Project
  if (length(xml_find_all(doc, '//*[(@id = "hidden_project")]')) != 0) {
    return(sup)
  }

  rewards <- xml_find_all(doc, paste0("//*[", has_class("pledge-selectable-sidebar"), "]"))
  if (length(rewards) == 0) return(sup)

  # Trimmed text of the first match of `xp` inside each reward (NA if none).
  reward_text <- function(xp) trimws(xml_text(xml_find_first(rewards, xp)))

  sup_amounts <- reward_text('.//h2[@class="pledge__amount"]')
  sup_desc <- reward_text(
    './/div[@class="pledge__reward-description pledge__reward-description--expanded"]'
  )
  sup_extraInfo <- reward_text('.//div[@class="pledge__extra-info"]')

  # backer count: first number in the text, thousands separators removed
  backer_txt <- reward_text(paste0(".//*[", has_class("pledge__backer-count"), "]"))
  backer_hit <- regexpr("[[:digit:]][[:digit:],]*", backer_txt)
  found <- !is.na(backer_hit) & backer_hit > 0
  sup_baker <- rep(NA_real_, length(rewards))
  sup_baker[found] <- as.numeric(
    gsub(",", "", regmatches(backer_txt, backer_hit), fixed = TRUE)
  )

  data.frame(
    purl = x,
    sup_amounts = sup_amounts,
    sup_desc = sup_desc,
    sup_extraInfo = sup_extraInfo,
    sup_baker = sup_baker
  )
})))

# View(supports)
save(supports, file="data/supportInfo.rdata")
"done"
```





---
<br> <br> <hr>

## RSelenium
```{r include=FALSE}
# Start a Selenium server plus browser session with rsDriver() defaults.
# `rD` holds the server/client pair; `remDr` is the client that the scraping
# chunks below drive.
rD <- rsDriver()
remDr <- rD[["client"]]
```

### comment information
```{r}
# Counter left at top level because other code may read it; the per-project
# function below no longer shadows it with a dead local copy.
iter <- 0

# Visit the comments page of every project in `project_url` with the Selenium
# client `remDr` and collect one row per comment: url, cmt_author, cmt_content.
# Author and text are read from inside the same list item so they stay paired;
# placeholder items for canceled pledges are dropped together with their
# author. A project without readable comments yields a single NA/NA row.
comments <- do.call(rbind, unname(lapply(project_url, function(x) {

  canceled_msg <- "This person has canceled their pledge. Show the comment."
  list_xpath <- '//*[contains(concat( " ", @class, " " ), concat( " ", "p2", " " )) and contains(concat( " ", @class, " " ), concat( " ", "mb3", " " ))]'

  # Visible text of a list of web elements as a character vector.
  elem_text <- function(elems) {
    unlist(lapply(elems, function(e) e$getElementText()))
  }

  # drop the query string (e.g. "?ref=...") before appending the sub-page
  remDr$navigate(paste0(sub("[?#].*$", "", x), "/comments"))

  cmt_author <- NA
  cmt_content <- NA

  tryCatch({
    boxes <- remDr$findElements(using = 'xpath', list_xpath)
    if (length(boxes) > 0) {
      cmts <- boxes[[1]]$findChildElements(using = "tag name", value = "li")

      # ".//" keeps the lookup inside the current comment item
      authors <- vapply(cmts, function(li) {
        who <- elem_text(li$findChildElements(using = "xpath", value = ".//span[@class='mr2']"))
        if (length(who) == 0) NA_character_ else who[1]
      }, character(1))
      contents <- vapply(cmts, function(li) {
        paste(elem_text(li$findChildElements(using = "tag name", value = "p")),
              collapse = '\n')
      }, character(1))

      keep <- contents != canceled_msg
      if (any(keep)) {
        cmt_author <- authors[keep]
        cmt_content <- contents[keep]
      }
    }
  }, warning = function(war) {
    print(paste("MY_WARNING:  ", conditionMessage(war)))
  }, error = function(err) {
    print(paste("MY_ERROR:  ", conditionMessage(err)))
  })

  data.frame(
    url = x,
    cmt_author = cmt_author,
    cmt_content = cmt_content
  )
})))

# save: the path must be passed as `file =`, otherwise save() treats the
# string as the name of an object to store and stops because no file is given
save(comments, file = "data/commentsInfo.rdata")
"done"
```


### Project Main Video

**complete**
```{r}
# Counter left at top level because other code may read it; the per-project
# function below no longer shadows it with a dead local copy.
iter <- 0

# For every project in `project_url`, open the page with the Selenium client
# `remDr`, click the play button of the main video and record the src of the
# first source element. One row per project: url, src ("" when the project has
# no play button or no source element).
proj_vid <- do.call(rbind, unname(lapply(project_url, function(x) {

  play_class <- "m-auto w20p h20p w15p-md h15p-md p1 p2-md bg-green-700 border border-white border-medium"

  remDr$navigate(x)

  # RSelenium
  webElems <- remDr$findElements(using = 'class', "border-medium")
  webElemsClasses <- unlist(lapply(webElems, function(e) e$getElementAttribute("class")))

  vid_src <- ""
  tryCatch({
    play_idx <- which(webElemsClasses == play_class)
    if (length(play_idx) > 0) {
      # clicking the play button makes the page load the video sources
      webElems[[play_idx[1]]]$clickElement()

      # get source; guard against an empty list, which as.character() would
      # otherwise turn into the string "NULL"
      sources <- remDr$findElements(using = 'xpath', "//source")
      srcs <- unlist(lapply(sources, function(e) e$getElementAttribute("src")))
      if (length(srcs) > 0) vid_src <- as.character(srcs[1])
    }
  }, warning = function(war) {
    print(paste("MY_WARNING:  ", conditionMessage(war)))
  }, error = function(err) {
    print(paste("MY_ERROR:  ", conditionMessage(err)))
  })

  data.frame(url = x, src = vid_src)
})))

# View(proj_vid)

# save: the path must be passed as `file =`, otherwise save() treats the
# string as the name of an object to store and stops because no file is given
save(proj_vid, file = "data/videoInfo.rdata")
"done"
```

#### 關掉Selenium Server
```{r}
# Close the browser session.
# NOTE(review): chunks further down in this notebook still call remDr; run
# this only after they are finished.
remDr$close()

# stop the selenium server
rD[["server"]]$stop() 
```



### collaborator 
**not complete**
```{r}

# Exploratory (unfinished) steps: open one project page, open the creator
# pop-up and print the text of the h4 headings with class "mt0 mb3 type-18".
# The successive assignments to x are alternative test projects; the last wins.
x = project_url[118]
x = 'https://www.kickstarter.com/projects/1971386116/little-miss-sumo?ref=discovery'
x = 'https://www.kickstarter.com/projects/beehivebooks/i-will-live-forever-papercraft-comics-by-maelle-do?ref=discovery'

# class attribute of every element, NA when an element has none, so the result
# always has the same length and order as the element list
elem_classes <- function(elems) {
  vapply(elems, function(e) {
    cl <- unlist(e$getElementAttribute("class"))
    if (length(cl) == 0) NA_character_ else cl[1]
  }, character(1))
}
creator_class <- "w4 w7-md mb2-md pointer flex-noshrink keyboard-focusable"

# open the project page
remDr$navigate(x)

# open the creator pop-up: prefer the second matching element and fall back
# to the first one when the second is missing or cannot be clicked
webElems <- remDr$findElements(using = 'class', "flex-noshrink")
creator_idx <- which(elem_classes(webElems) == creator_class)
if (length(creator_idx) == 0) {
  print("MY_ERROR:   creator element not found")
} else {
  tryCatch({
    webElems[[creator_idx[2]]]$clickElement()
  }, warning = function(war) {
    print(paste("MY_WARNING:  ", conditionMessage(war)))
  }, error = function(err) {
    webElems[[creator_idx[1]]]$clickElement()
    print(paste("MY_ERROR:  ", conditionMessage(err)))
  })
}

# headings inside the pop-up: call getElementText() on each matching element
# (the selection is a list, so `$getElementText` on it would just give NULL)
webElems = remDr$findElements(using='xpath', '//h4')
webElem <- webElems[which(elem_classes(webElems) == "mt0 mb3 type-18")]
unlist(lapply(webElem, function(e) e$getElementText()))
  
```


### creator small page

**not complete**
```{r}
# Unfinished draft: for the first five project URLs, open the project page,
# click through to the creator pop-up and (eventually) collect creator details
# into one data frame row per project.
projURL_df = data.frame()
projURL_df = Reduce(rbind, Map(function(x){

  # NOTE(review): this overwrites the argument, so every iteration visits
  # project_url[118]; looks like a debugging leftover - confirm and remove.
  x = project_url[118]
  # open the project page
  remDr$navigate(x)
  
  # open the creator pop-up: try the second element with the creator class
  # string first; the error handler falls back to the first one
  webElems <- remDr$findElements(using = 'class', "flex-noshrink")
  webElemsClasses <- unlist(lapply(webElems, function(x){x$getElementAttribute("class")}))
  Erroresult<- tryCatch({
    webElem <- webElems[[which(webElemsClasses == "w4 w7-md mb2-md pointer flex-noshrink keyboard-focusable")[2]]]
    webElem$clickElement()
  }, warning = function(war) {
    print(paste("MY_WARNING:  ",war))
  }, error = function(err) {
    webElem <- webElems[[which(webElemsClasses == "w4 w7-md mb2-md pointer flex-noshrink keyboard-focusable")[1]]]
    webElem$clickElement()
    print(paste("MY_ERROR:  ",err))
 })
  
  ## last login date
  # (not implemented yet)

  ## facebook connect
  # (not implemented yet)
  # use this project !
  # https://www.kickstarter.com/projects/1870476470/tomb-of-horror-vol-2-horror-anthology?ref=category_newest
  
  
  ## Collaborators on this project
  # (not implemented yet)
  
  
  
  # store as a data frame
  # NOTE(review): `desc` and `text` are never assigned in this function, so
  # this line depends on whatever those names resolve to outside it - confirm.
  df = data.frame(url= x, description = desc, text = text)

}, project_url[1:5]))
```

#### 關掉Selenium Server
```{r}
# Close the browser session.
# NOTE(review): an identical shutdown chunk appears earlier in this notebook;
# confirm only one of them is run per session.
remDr$close()

# stop the selenium server
rD[["server"]]$stop() 
```


---
<br> <br> <hr>

### creator biography可讀性
```{r}
# ra_bio = with(all_bio, readability(bio, url))
# ra_bio
# summary(ra_bio)
```