library(data.table)
# Read one Kickstarter export CSV into a plain data.frame (not a data.table),
# keeping strings as character and printing fread's diagnostic log.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be
# reassigned.
kickstarter <- fread(
  "Kickstarter_2017-01-15T22_21_04_985Z/Kickstarter041.csv",
  data.table = FALSE,
  verbose = TRUE,
  stringsAsFactors = FALSE
)
Input contains no \n. Taking this to be a filename to open
[01] Check arguments
Using 8 threads (omp_get_max_threads()=8, nth=8)
NAstrings = [<<NA>>]
None of the NAstrings look like numbers.
show progress = 1
0/1 column will be read as integer
[02] Opening the file
Opening file Kickstarter_2017-01-15T22_21_04_985Z/Kickstarter041.csv
File opened, size = 14.93MB (15651944 bytes).
Memory mapped ok
[03] Detect and skip BOM
[04] Arrange mmap to be \0 terminated
\n has been found in the input and different lines can end with different line endings (e.g. mixed \n and \r\n in one file). This is common and ideal.
[05] Skipping initial rows if needed
Positioned on line 1 starting: <<id,photo,name,blurb,goal,pledg>>
[06] Detect separator, quoting rule, and ncolumns
Detecting sep automatically ...
sep=',' with 100 lines of 32 fields using quote rule 0
Detected 32 columns on line 1. This line is either column names or first data row. Line starts as: <<id,photo,name,blurb,goal,pledg>>
Quote rule picked = 0
fill=false and the most number of columns found is 32
[07] Detect column types, good nrow estimate and whether first row is column names
Number of sampling jump points = 10 because (15651943 bytes from row 1 to eof) / (2 * 473340 jump0size) == 16
Type codes (jump 000) : 5AAA57AA4AAA455554577AAAA4AA2222 Quote rule 0
Type codes (jump 004) : 5AAA77AA4AAA455554577AAAA4AA2222 Quote rule 0
Type codes (jump 010) : 5AAA77AA4AAA455554577AAAA4AA2222 Quote rule 0
'header' determined to be true due to column 1 containing a string on row 1 and a lower type (int32) in the rest of the 1049 sample rows
=====
Sampled 1049 rows (handled \n inside quoted fields) at 11 jump points
Bytes from first data row on line 2 to the end of last row: 15651617
Line length: mean=4777.62 sd=92.31 min=4228 max=5341
Estimated number of rows: 15651617 / 4777.62 = 3277
Initial alloc = 3604 rows (3277 + 9%) using bytes/max(mean-2*sd,min) clamped between [1.1*estn, 2.0*estn]
=====
[08] Assign column names
[09] Apply user overrides on column types
After 0 type and 0 drop user overrides : 5AAA77AA4AAA455554577AAAA4AA2222
[10] Allocate memory for the datatable
Allocating 32 column slots (32 - 0 dropped) with 3604 rows
[11] Read the data
jumps=[0..3), chunk_size=5217205, total_size=15651617
Read 3269 rows x 32 columns from 14.93MB (15651944 bytes) file in 00:00.171 wall clock time
[12] Finalizing the datatable
Type counts:
4 : bool8 '2'
4 : bool8 '4'
6 : int32 '5'
4 : float64 '7'
14 : string 'A'
=============================
0.009s ( 5%) Memory map 0.015GB file
0.093s ( 54%) sep=',' ncol=32 and header detection
0.001s ( 1%) Column type detection using 1049 sample rows
0.001s ( 1%) Allocation of 3604 rows x 32 cols (0.001GB) of which 3269 ( 91%) rows used
0.067s ( 39%) Reading 3 chunks (0 swept) of 4.976MB (each chunk 1089 rows) using 3 threads
+ 0.031s ( 18%) Parse to row-major thread buffers (grown 0 times)
+ 0.022s ( 13%) Transpose
+ 0.014s ( 8%) Waiting
0.000s ( 0%) Rereading 0 columns due to out-of-sample type exceptions
0.171s Total
# Collapse doubled quotes ("" -> ") in every column. gsub() returns character,
# so after this step every column is a character vector.
# lapply() + `kickstarter[] <-` keeps the data.frame shape for any row count;
# sapply() would collapse a one-row table into a plain vector, which
# data.frame() then turns into a single column.
kickstarter[] <- lapply(
  kickstarter,
  function(column) gsub("\"\"", "\"", column)
)

# Columns converted back from character to numeric.
numeric_cols <- c(
  "goal", "pledged", "deadline", "state_changed_at", "created_at",
  "launched_at", "backers_count", "static_usd_rate", "usd_pledged"
)
# Columns converted to factors.
# `is_starrable` is deliberately left as character (its conversion was
# commented out in the original script).
factor_cols <- c(
  "state", "disable_communication", "country", "currency",
  "currency_trailing_code", "staff_pick", "spotlight"
)

kickstarter[numeric_cols] <- lapply(kickstarter[numeric_cols], as.numeric)
kickstarter[factor_cols] <- lapply(kickstarter[factor_cols], as.factor)

summary(kickstarter)
id photo name blurb goal
Length:3269 Length:3269 Length:3269 Length:3269 Min. : 0
Class :character Class :character Class :character Class :character 1st Qu.: 700
Mode :character Mode :character Mode :character Mode :character Median : 2331
Mean : 52307
3rd Qu.: 5500
Max. :100000000
pledged state slug disable_communication country
Min. : 0 canceled : 418 Length:3269 FALSE:3257 US :2520
1st Qu.: 4 failed :2085 Class :character TRUE : 12 GB : 370
Median : 118 live : 38 Mode :character CA : 143
Mean : 1780 successful: 716 AU : 61
3rd Qu.: 785 suspended : 12 NL : 32
Max. :266305 FR : 23
(Other): 120
currency currency_symbol currency_trailing_code deadline state_changed_at
USD :2520 Length:3269 FALSE: 507 Min. :1.243e+09 Min. :1.243e+09
GBP : 370 Class :character TRUE :2762 1st Qu.:1.366e+09 1st Qu.:1.366e+09
CAD : 143 Mode :character Median :1.415e+09 Median :1.415e+09
EUR : 133 Mean :1.404e+09 Mean :1.404e+09
AUD : 61 3rd Qu.:1.445e+09 3rd Qu.:1.445e+09
SEK : 19 Max. :1.489e+09 Max. :1.485e+09
(Other): 23
created_at launched_at staff_pick backers_count static_usd_rate
Min. :1.242e+09 Min. :1.242e+09 FALSE:3050 Min. : 0.0 Min. :0.04823
1st Qu.:1.361e+09 1st Qu.:1.364e+09 TRUE : 219 1st Qu.: 1.0 1st Qu.:1.00000
Median :1.409e+09 Median :1.412e+09 Median : 4.0 Median :1.00000
Mean :1.399e+09 Mean :1.401e+09 Mean : 24.6 Mean :1.04528
3rd Qu.:1.440e+09 3rd Qu.:1.442e+09 3rd Qu.: 18.0 3rd Qu.:1.00000
Max. :1.484e+09 Max. :1.485e+09 Max. :1633.0 Max. :1.71447
usd_pledged creator location category profile
Min. : 0.0 Length:3269 Length:3269 Length:3269 Length:3269
1st Qu.: 4.2 Class :character Class :character Class :character Class :character
Median : 121.0 Mode :character Mode :character Mode :character Mode :character
Mean : 1537.1
3rd Qu.: 800.0
Max. :111111.8
spotlight urls source_url friends is_starred
FALSE:2553 Length:3269 Length:3269 Length:3269 Length:3269
TRUE : 716 Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
is_backing permissions
Length:3269 Length:3269
Class :character Class :character
Mode :character Mode :character
# Positions of the rows whose `creator` field is valid JSON (names dropped).
vjson <- unname(which(sapply(kickstarter$creator, jsonlite::validate) == TRUE))
library(rjson)
# Parse the valid entries and pull each creator's web profile URL.
creator <- lapply(kickstarter$creator[vjson], fromJSON)
creator_url <- sapply(X = creator, FUN = function(entry) entry$urls$web$user)
head(creator_url)
[1] "https://www.kickstarter.com/profile/1785411136" "https://www.kickstarter.com/profile/686630136"
[3] "https://www.kickstarter.com/profile/660752087" "https://www.kickstarter.com/profile/1159279616"
[5] "https://www.kickstarter.com/profile/littlealicecrafts" "https://www.kickstarter.com/profile/1988843766"
library(xml2)
library(XML)
這裡只 retrieve 前 100 頁,因為全部抓取需要非常久的時間。
# Fetch one creator's "/about" page and return a data frame with one row per
# matched location <span>: `url` is the profile URL, `bio` the cleaned text.
fetch_creator_location <- function(profile_url) {
  doc <- read_html(paste0(profile_url, "/about"))
  # Pick out the location text.
  location_nodes <- xml_find_all(doc, '//span[contains(@class,"location")]')
  # paste0("", ...) yields a single "" when nothing matches, so every creator
  # still contributes a row.
  bio <- paste0("", xml_text(location_nodes))
  # Remove line breaks, then ALL leading/trailing whitespace. The previous
  # pattern "^\\s|\\s$" stripped only one character from each end.
  bio <- gsub("\n", "", bio)
  bio <- gsub("^\\s+|\\s+$", "", bio)
  # Keep text as character regardless of the R version's default.
  data.frame(url = profile_url, bio, stringsAsFactors = FALSE)
}

# Only the first 100 creators are scraped, one HTTP request each.
# head() avoids NA URLs when fewer than 100 are available, and a single
# do.call(rbind, ...) avoids re-copying the accumulated frame on every step.
creator_location <- do.call(
  rbind,
  lapply(unname(head(creator_url, 100)), fetch_creator_location)
)
View(creator_location)
# Positions of the rows whose `urls` field is valid JSON (names dropped).
urljson <- unname(which(sapply(kickstarter$urls, jsonlite::validate) == TRUE))
library(rjson)
# Parse the valid entries and pull each project's web URL.
projects <- lapply(kickstarter$urls[urljson], fromJSON)
projects_url <- sapply(X = projects, FUN = function(entry) entry$web$project)
head(projects_url)
[1] "https://www.kickstarter.com/projects/1785411136/burnin-4-you-woodburning?ref=category"
[2] "https://www.kickstarter.com/projects/686630136/reasons-to-be-cheerful?ref=category"
[3] "https://www.kickstarter.com/projects/660752087/artwork-by-harriette-harrison?ref=category"
[4] "https://www.kickstarter.com/projects/1159279616/pixel-pets-5-w-x-3-h-sticker-art?ref=category"
[5] "https://www.kickstarter.com/projects/littlealicecrafts/the-little-alice-colouring-book?ref=category"
[6] "https://www.kickstarter.com/projects/1988843766/mr-bigfish-vinyl-stickers?ref=category"
library(dplyr)
package ‘dplyr’ was built under R version 3.5.1
Attaching package: ‘dplyr’
The following objects are masked from ‘package:data.table’:
between, first, last
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(RSelenium)
# Start the Selenium driver. The returned object's "client" and "server"
# entries are used further down to drive the browser and to shut it down.
rD <- rsDriver()
checking Selenium Server versions:
BEGIN: PREDOWNLOAD
BEGIN: DOWNLOAD
Creating directory: C:\Users\adm\AppData\Local\binman\binman_seleniumserver\generic\3.14.0
Downloading binary: https://www.googleapis.com/download/storage/v1/b/selenium-release/o/3.14%2F...
BEGIN: POSTDOWNLOAD
checking chromedriver versions:
BEGIN: PREDOWNLOAD
BEGIN: DOWNLOAD
BEGIN: POSTDOWNLOAD
checking geckodriver versions:
BEGIN: PREDOWNLOAD
BEGIN: DOWNLOAD
BEGIN: POSTDOWNLOAD
checking phantomjs versions:
BEGIN: PREDOWNLOAD
BEGIN: DOWNLOAD
BEGIN: POSTDOWNLOAD
[1] "Connecting to remote server"
$`acceptInsecureCerts`
[1] FALSE
$acceptSslCerts
[1] FALSE
$applicationCacheEnabled
[1] FALSE
$browserConnectionEnabled
[1] FALSE
$browserName
[1] "chrome"
$chrome
$chrome$`chromedriverVersion`
[1] "2.41.578737 (49da6702b16031c40d63e5618de03a32ff6c197e)"
$chrome$userDataDir
[1] "C:\\Users\\adm\\AppData\\Local\\Temp\\scoped_dir15952_23970"
$cssSelectorsEnabled
[1] TRUE
$databaseEnabled
[1] FALSE
$`goog:chromeOptions`
$`goog:chromeOptions`$`debuggerAddress`
[1] "localhost:54493"
$handlesAlerts
[1] TRUE
$hasTouchScreen
[1] FALSE
$javascriptEnabled
[1] TRUE
$locationContextEnabled
[1] TRUE
$mobileEmulationEnabled
[1] FALSE
$nativeEvents
[1] TRUE
$networkConnectionEnabled
[1] FALSE
$pageLoadStrategy
[1] "normal"
$platform
[1] "Windows NT"
$rotatable
[1] FALSE
$setWindowRect
[1] TRUE
$takesHeapSnapshot
[1] TRUE
$takesScreenshot
[1] TRUE
$unexpectedAlertBehaviour
[1] ""
$version
[1] "68.0.3440.106"
$webStorageEnabled
[1] TRUE
$webdriver.remote.sessionid
[1] "09961742d677020761caa6714196ec31"
$id
[1] "09961742d677020761caa6714196ec31"
# Take the browser client out of the driver object.
remDr <- rD[["client"]]
# Open the 55th project URL, then immediately navigate to a hard-coded project
# page (the second call replaces whatever the first one loaded).
remDr$navigate(projects_url[55])
remDr$navigate('https://www.kickstarter.com/projects/771968170/reveries-an-analog-photo-book-about-guangzhou-chin?ref=home_new_and_noteworthy')
# NOTE(review): `webElem` is not assigned anywhere above this line in the
# visible script; presumably it is left over from an earlier interactive
# session. Confirm before running top to bottom.
webElem$clickElement()
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
# Two absolute XPaths noted for the same kind of <span>; they differ only in
# div[7] vs div[8] near the top of the path.
# xpath: //*[@id="react-project-header"]/div/div/div[7]/div/div/div[2]/div/div/div/div/div/div[2]/div/div[2]/div[1]/div/div[4]/span/text()
# xpath: //*[@id="react-project-header"]/div/div/div[8]/div/div/div[2]/div/div/div/div/div/div[2]/div/div[2]/div[1]/div/div[4]/span
# Look the element up with the div[7] variant. findElement() raises
# NoSuchElement when the path does not exist on the current page.
webElem <- remDr$findElement(using = 'xpath', '//*[@id="react-project-header"]/div/div/div[7]/div/div/div[2]/div/div/div/div/div/div[2]/div/div[2]/div[1]/div/div[4]/span')
Selenium message:no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="react-project-header"]/div/div/div[7]/div/div/div[2]/div/div/div/div/div/div[2]/div/div[2]/div[1]/div/div[4]/span"}
(Session info: chrome=68.0.3440.106)
(Driver info: chromedriver=2.41.578737 (49da6702b16031c40d63e5618de03a32ff6c197e),platform=Windows NT 10.0.17134 x86_64)
Error: Summary: NoSuchElement
Detail: An element could not be located on the page using the given search parameters.
Further Details: run errorDetails method
# "Created" count from the scraped text: keep everything up to and including
# "created", drop that word, map the literal "First" to "1", and trim.
created <- trimws(
  sub(
    "First", "1",
    sub("created", "", regmatches(text, regexpr(".+created", text)))
  )
)
created
[1] "1"
# "Backed" count from the scraped text: keep everything up to and including
# "backed", remove the leading "... created" part plus the two characters after
# it, drop the word "backed", and trim.
backed <- trimws(
  sub(
    "backed", "",
    sub(".+created..", "", regmatches(text, regexpr(".+backed", text)))
  )
)
backed
[1] "0"
# class : block type-16 link-soft-black medium
# Collect every element carrying the class "medium", read each element's full
# class attribute, and keep the first one whose class string is exactly the
# one noted above.
webElems <- remDr$findElements(using = 'class', "medium")
webElemsClasses <- unlist(lapply(webElems, function(x){x$getElementAttribute("class")}))
webElem <- webElems[[which(webElemsClasses == "block type-16 link-soft-black medium")[1]]]
# xpath: //*[@id="react-project-header"]/div/div/div[8]/div/div/div[2]/div/div/div/div/div/div[2]/div/div[2]/div[2]/div/a[2]
# Alternative lookup of an <a> element by absolute XPath; this overwrites the
# `webElem` found by class just above.
webElem <- remDr$findElement(using = 'xpath', '//*[@id="react-project-header"]/div/div/div[8]/div/div/div[2]/div/div/div/div/div/div[2]/div/div[2]/div[2]/div/a[2]')
0812
# Project URL used for the single-page walkthrough below.
x <- projects_url[5]
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
# Go to the project page.
remDr$navigate(x)
# Open the small page: among all elements with class "flex-noshrink", click the
# first one whose full class attribute matches the string below exactly.
webElems <- remDr$findElements(using = 'class', "flex-noshrink")
webElemsClasses <- unlist(lapply(webElems, function(x){x$getElementAttribute("class")}))
webElem <- webElems[[which(webElemsClasses == "w4 w7-md mb2-md pointer flex-noshrink keyboard-focusable")[1]]]
webElem$clickElement()
# Find the element by absolute XPath and read its text.
# getElementText() returns a list, so `text` is a one-element list here.
webElem <- remDr$findElement(using = 'xpath', '//*[@id="react-project-header"]/div/div/div[7]/div/div/div[2]/div/div/div/div/div/div[2]/div/div[2]/div[1]/div/div[4]/span')
text = webElem$getElementText()
text
[[1]]
[1] "First created · 0 backed"
# Close the browser session held by the client.
remDr$close()
# Stop the Selenium server.
rD[["server"]]$stop()
# Positions of the rows whose `urls` field is valid JSON (names dropped).
urljson <- unname(which(sapply(kickstarter$urls, jsonlite::validate) == TRUE))
library(rjson)
# Parse the valid entries and pull each project's web URL.
projects <- lapply(kickstarter$urls[urljson], fromJSON)
projects_url <- sapply(X = projects, FUN = function(entry) entry$web$project)
head(projects_url)
# Cut everything from the character preceding "ref=" onwards, then trim.
projects_url <- trimws(sub(".ref=.*", "", projects_url))
# projects_url <- paste0(projects_url, "/comments")
projects_url
library(xml2)
library(magrittr)
library(rvest)
# Text of every node matching ".collaborator p" on the page at `url`.
# NOTE(review): no top-level `url` is assigned in the visible script; confirm
# where it is expected to come from.
comment <- html_text(html_nodes(read_html(url), ".collaborator p"))
# Fetch one project's "/comments" page and return a data frame with one row per
# node matching ".collaborator p": `url` is the project URL, `cmt` the text.
fetch_project_comments <- function(project_url) {
  doc <- read_html(paste0(project_url, "/comments"))
  # xpath <- '//*[contains(concat( " ", @class, " " ), concat( " ", "collaborator", " " ))]//p'
  # paste0("", ...) yields a single "" when nothing matches, so every project
  # still contributes a row.
  cmt <- paste0("", xml_text(html_nodes(doc, ".collaborator p")))
  # Drop a leading "@name ..." first line, then trim.
  cmt <- sub("^@[[:space:]]?[[:alpha:]]+.*\n", "", cmt)
  cmt <- trimws(cmt)
  # Keep text as character regardless of the R version's default.
  data.frame(url = project_url, cmt, stringsAsFactors = FALSE)
}

# Only the first 20 projects are scraped, one HTTP request each.
# head() avoids NA URLs when fewer than 20 are available, and a single
# do.call(rbind, ...) avoids re-copying the accumulated frame on every step.
all_comments <- do.call(
  rbind,
  lapply(unname(head(projects_url, 20)), fetch_project_comments)
)
# URL column (column 1) of the rows whose comment text is non-empty.
all_comments[which(all_comments$cmt != ""), 1]
# ra_bio = with(all_bio, readability(bio, url))
# ra_bio
# summary(ra_bio)