# Bootstrap the packages this script needs before anything else runs.
# requireNamespace() asks R directly whether a package can be loaded, instead
# of searching every cell of the installed.packages() matrix for its name.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
if (!requireNamespace("srhoads", quietly = TRUE)) {
  devtools::install_github("srhoads/srhoads")
}
# Attach srhoads unconditionally: previously a fresh install skipped the
# library() call, so the pkg() call below failed on the first run.
library(srhoads)
# NOTE(review): pkg() is presumably srhoads' install-and-attach helper — confirm.
pkg("shiny")
Kickstarter is an American public-benefit corporation based in Brooklyn, New York, that maintains a global crowdfunding platform focused on creativity. The company’s stated mission is to “help bring creative projects to life”.
Kickstarter has reportedly received more than $1.9 billion in pledges from 9.4 million backers to fund 257,000 creative projects, such as films, music, stage shows, comics, journalism, video games, technology and food-related projects.
For this assignment, I am asking you to analyze the descriptions of Kickstarter projects to identify commonalities of successful (and unsuccessful) projects using the text mining techniques we covered in the past two lectures.
The dataset for this assignment is taken from webrobots.io’s repository. They developed a scraper robot that has crawled all Kickstarter projects monthly since 2009. We will just take data from the most recent crawl on 2018-02-15.
To simplify your task, I have downloaded the files and partially cleaned the scraped data. In particular, I converted several JSON columns, corrected some obvious data issues, and removed some variables that are not of interest (or missing frequently). I have also subsetted the data to only contain projects originating in the United States (to have only English language and USD denominated projects).
The data is contained in the file kickstarter_projects.csv and contains about 150,000 projects and about 20 variables.
Below, I’m reading the data straight from github.
# d <- read.csv("https://raw.githubusercontent.com/QMSS-GR5063-2018/DV_CU_course_material/master/Exercises/09_kickstarter/kickstarter_projects.csv?token=AZbwLuzRYxYEP2HM2U0W6xLJR6ZeCw8-ks5ayrITwA%3D%3D")
# Load the Kickstarter projects table into `d`.
# Sources are tried in order and the first one that succeeds wins:
#   1. the course CSV on GitHub with the first access token,
#   2. the same CSV with the second access token,
#   3. a local cache file, "kickstarter_projects.csv.rda".
# Each failure is reported with message() instead of being discarded, so it is
# visible which source was actually used. If every source fails, `d` is NULL.
# NOTE(review): the ?token= values are access tokens committed to source;
# presumably they expire — confirm and consider removing them.
d <- local({
  csv_url <- "https://raw.githubusercontent.com/QMSS-GR5063-2018/DV_CU_course_material/master/Exercises/09_kickstarter/kickstarter_projects.csv"
  loaders <- list(
    function() read.csv(paste0(csv_url, "?token=AZbwLuzRYxYEP2HM2U0W6xLJR6ZeCw8-ks5ayrITwA%3D%3D")),
    function() read.csv(paste0(csv_url, "?token=AGLPALRW2PWE4X2PF4ZZSGC5AR3WK")),
    # load() returns the name of the restored object; get() fetches its value.
    function() get(load("kickstarter_projects.csv.rda"))
  )
  loaded <- NULL
  for (loader in loaders) {
    loaded <- tryCatch(
      loader(),
      error = function(e) {
        message("Kickstarter data source unavailable: ", conditionMessage(e))
        NULL
      }
    )
    if (!is.null(loaded)) {
      break
    }
  }
  loaded
})
Interestingly enough, it seems that .rda (RData) files are smaller than .f (feather) files.
# Keep the loaded table under a descriptive name and optionally cache it to
# disk as an .rda file (the file the loader falls back to when offline).
kickstarter_projects <- d
# Flip to TRUE to refresh the local cache. TRUE/FALSE are used rather than
# T/F because T and F are ordinary variables that can be reassigned.
save_kickstarter_projects <- FALSE
# Never write the cache when loading failed (d is NULL): that would overwrite
# a usable cache file with an empty object.
if (save_kickstarter_projects && !is.null(kickstarter_projects)) {
  save(kickstarter_projects, file = "kickstarter_projects.csv.rda") # feather::write_feather(kickstarter_projects, "kickstarter_projects.csv.f")
}
# Offline fallback: when no data source could be read (d is NULL), substitute
# a hand-written five-row sample so the rest of the script has a table to use.
# Text columns are stored as factors.
if (is.null(d)) {
  d <- data.frame(
    backers_count = c(51L, 26L, 89L, 41L, 11L),
    converted_pledged_amount = c(1536L, 1616L, 3700L, 1510L, 1054L),
    goal = c(1000, 1500, 500, 1200, 2957),
    id = c(747214266L, 167852290L, 954681482L, 219527796L, 1832299147L),
    pledged = c(1536.01, 1616, 3700, 1510, 1054),
    # The first blurb contains a literal line break inside the string.
    blurb = factor(c(
      "Fancy Ketchup seeks the help of its loyal fan base to raise enough money to fund its followup to its first album,
\"Hold the Mayo.\"",
      "THE PLATH PROJECT: TWO WORLD PREMIERES\n@ The Center for New Music, San Francisco",
      "Quality handmade pens made from exotic hardwoods and other elegant materials",
      "\"The Dracula Letters\" is the latest project by composer S.J. Pettersson featuring famed mezzo soprano Iris Malkin.",
      "Powerfully Healing Perspective... Learn how flipping the current pain scale empowers your body/mind to rewire faulty brain programming!"
    )),
    country = factor(c("USA", "USA", "USA", "USA", "USA")),
    created_at = factor(c(
      "2013-02-06", "2014-10-09", "2012-09-29", "2014-10-02", "2018-01-21"
    )),
    currency = factor(c("USD", "USD", "USD", "USD", "USD")),
    deadline = factor(c(
      "2013-03-15", "2014-12-24", "2012-10-30", "2014-11-05", "2018-02-23"
    )),
    is_starrable = factor(c("false", "false", "false", "false", "true")),
    launched_at = factor(c(
      "2013-02-13", "2014-10-25", "2012-10-10", "2014-10-06", "2018-01-24"
    )),
    name = factor(c(
      "Fancy Ketchup's Second Album",
      "The Plath Project",
      "Handcrafted Pens Made from Exotic Woods",
      "\"The Dracula Letters\" - by S.J. Pettersson",
      "Comfort Quest- HEALING Pain through a Transformative Lens"
    )),
    slug = factor(c(
      "fancy-ketchups-second-album",
      "the-plath-project",
      "handcrafted-pens-made-from-exotic-woods",
      "the-dracula-letters-by-sj-pettersson",
      "comfort-quest-healing-pain-through-a-transformativ"
    )),
    source_url = factor(c(
      "https://www.kickstarter.com/discover/categories/music/rock",
      "https://www.kickstarter.com/discover/categories/music/classical%20music",
      "https://www.kickstarter.com/discover/categories/crafts",
      "https://www.kickstarter.com/discover/categories/music/classical%20music",
      "https://www.kickstarter.com/discover/categories/publishing/nonfiction"
    )),
    spotlight = factor(c("true", "true", "true", "true", "false")),
    staff_pick = factor(c("false", "true", "false", "true", "false")),
    state = factor(c(
      "successful", "successful", "successful", "successful", "live"
    )),
    state_changed_at = factor(c(
      "2013-03-15", "2014-12-24", "2012-10-30", "2014-11-05", "2018-01-24"
    )),
    location_town = factor(c(
      "Los Angeles", "San Francisco", "Tremonton", "Los Angeles", "Jacksonville"
    )),
    location_state = factor(c("CA", "CA", "UT", "CA", "FL")),
    top_category = factor(c("music", "music", "crafts", "music", "publishing")),
    sub_category = factor(c(
      "rock", "classical music", NA, "classical music", "nonfiction"
    ))
  )
}
There are several ways to identify success of a project:
- State (state): Whether a campaign was successful or not.
- Pledged Amount (pledged)
- Achievement Ratio: Create a variable achievement_ratio by calculating the percentage of the original monetary goal reached by the actual amount pledged (that is, $\text{pledged} / \text{goal} \times 100$).
- Number of backers (backers_count)
- How quickly the goal was reached (difference between launched_at and state_changed_at) for those campaigns that were successful.
success by achievement ratio
library(lubridate)
library(tidyverse)
library(plotly)
# Summarise the campaign state column.
summary(d[["state"]])
canceled failed live successful suspended
5994 53189 4191 84457 386
# Summarise the pledged amount column.
summary(d[["pledged"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 123 1661 12273 6293 10266846
# Achievement ratio: the percentage of the funding goal covered by the pledged
# amount, i.e. pledged / goal * 100 (100 means the goal was met exactly).
# The earlier goal / pledged form was the inverse of the stated definition and
# produced Inf for every project with nothing pledged; the summary printout
# recorded after this line in the file came from that inverted formula.
d$achievement_ratio <- (d$pledged / d$goal) * 100
summary(d$achievement_ratio)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.002 81.974 98.280 Inf 3793.627 Inf 52
# Summarise the number of backers per project.
summary(d[["backers_count"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 4.0 28.0 148.5 87.0 105857.0
# Time from launch to the last state change, stored on the table as goal_time
# (a difftime obtained by subtracting the two dates), then summarised.
d$goal_time <- date(d$state_changed_at) - date(d$launched_at)
summary(d$goal_time)
Length Class Mode
148217 difftime numeric
# The same launch-to-state-change interval divided by 7, stored as
# goal_time_weeks, then summarised.
d$goal_time_weeks <- (date(d$state_changed_at) - date(d$launched_at)) / 7
summary(d$goal_time_weeks)
Length Class Mode
148217 difftime numeric
Use one or more of these measures to visually summarize which categories were most successful in attracting funding on kickstarter. Briefly summarize your findings.
My plotly chart is a histogram of the number of projects in each category, broken down by the frequency of each state condition for its projects. Music has the highest number of successful projects. Film and publishing follow suit in number of successes. Tech has the most failures, but interestingly a lot of backing by backers (next visualization). Perhaps this is because tech can be more challenging than basic musical or literature-based projects.
After reviewing some of my visualizations below, it is pretty apparent that each of the top categories had more successes than any other one state condition. It looks like, by a moderate margin, technology has the largest number of successful projects that were backed by people/groups/institutions. For most of the categories, successful projects are the ones that had the highest backing. This makes sense, because they were being supported. This pattern is actually true for every category. The journalism category looks like it has the fewest backed projects, but the variability between its states of completion is much lower than seemingly all of the other categories. For theater, it seems that almost every one of its small number of projects succeeded in terms of backer support. The second and third most successful categories are film/video and games respectively. All of this seems intuitive because of the popularity of media, technology, and virtual gaming in the modern world.
The second two visualizations below show the length of time it took for each project to reach a change in state/completion. By and large, the most successful projects took the most time, with theater taking a slight lead over the rest. Most of the briefest project durations were the ones that were canceled. That’s good because it means people weren’t wasting TOO much time on projects that would go nowhere. But still, some of the failed projects outrank some of the successes in some categories in time spent on them, like technology. It’s interesting how technology has the greatest number of successful projects, but its failed projects took more time than did its successful ones (on average).
# Interactive histogram: number of projects per top-level category, with one
# colour per campaign state.
add_histogram(plot_ly(d, x = ~top_category, color = ~state))

# Backer counts by category, bars placed side by side per state.
ggplot(d, aes(x = top_category, y = backers_count, fill = state)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    x = "Top Category",
    y = "Number of Backers",
    title = "Success by Popular Category"
  )

# Launch-to-state-change time in weeks by category, bars side by side per state.
ggplot(d, aes(x = top_category, y = goal_time_weeks, fill = state)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    x = "Top Category",
    y = "Total Weeks to Finish Project",
    title = "Success by Popular Category"
  )

# Launch-to-state-change time (goal_time) by category, bars stacked by state.
ggplot(d, aes(x = top_category, y = goal_time, fill = state)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Top Project Category",
    y = "Total Days to Finish Project",
    title = "Project Success by Popular Category"
  )
library(plotly)
# One-off plotly account setup; disabled unless the flag is set to TRUE.
# TRUE/FALSE are used rather than T/F because T and F can be reassigned.
setup_plotly <- FALSE
if (setup_plotly) {
  R.home(component = "home")
  # Opens the user's .Renviron for editing.
  usethis::edit_r_environ()
  # SECURITY NOTE(review): a plotly API key is hard-coded in source below.
  # It should be treated as leaked: rotate it and keep the replacement in
  # .Renviron (opened by the call above) rather than in this script.
  Sys.setenv("plotly_username" = "SRhoads")
  Sys.setenv("plotly_api_key" = "VL5XaziLtaphG9hlICkH")
}
# Interactive box plot of goal_time_weeks, one box per top-level category.
p <- plot_ly(
  data = d,
  x = ~goal_time_weeks,
  color = ~top_category,
  type = "box"
)
p
# Box plots of goal_time_weeks per category (categories ordered by
# goal_time_weeks via reorder()), filled by state, with the individual
# projects jittered on top. Boxplot outliers are hidden because the jittered
# points already show every observation.
plot.b4.ly <- ggplot(
  d,
  aes(
    x = reorder(top_category, goal_time_weeks, na.rm = TRUE),
    y = goal_time_weeks
  )
) +
  geom_boxplot(aes(fill = state), outlier.colour = "transparent", alpha = 0.3) +
  coord_flip() +
  labs(x = "Popular Project Category", y = "Donor Procurement Rate") +
  geom_jitter(
    aes(fill = state),
    shape = 21,
    size = 1,
    position = position_jitter(width = 0.01)
  )
plot.b4.ly
# Variant with a single unfilled box per category; only the jittered points
# are filled by state.
plot.b4.ly.2 <- ggplot(
  d,
  aes(
    x = reorder(top_category, goal_time_weeks, na.rm = TRUE),
    y = goal_time_weeks
  )
) +
  geom_boxplot(outlier.colour = "transparent", alpha = 0.3) +
  coord_flip() +
  labs(x = "Popular Project Category", y = "Donor Procurement Rate") +
  geom_jitter(
    aes(fill = state),
    shape = 21,
    size = 1,
    position = position_jitter(width = 0.01)
  )
plot.b4.ly.2
# Variant with state-filled boxes only, without the jittered points.
plot.b4.ly.3 <- ggplot(
  d,
  aes(
    x = reorder(top_category, goal_time_weeks, na.rm = TRUE),
    y = goal_time_weeks
  )
) +
  geom_boxplot(aes(fill = state), outlier.colour = "transparent", alpha = 0.3) +
  coord_flip() +
  labs(x = "Popular Project Category", y = "Donor Procurement Rate")
plot.b4.ly.3
# Convert the first box-plot figure into an interactive plotly widget.
ggplotly(p = plot.b4.ly)