Text Mining Kickstarter Projects

if(!("devtools" %in% installed.packages())) install.packages("devtools")
if("srhoads" %in% installed.packages()) library(srhoads) else devtools::install_github("srhoads/srhoads")
pkg("shiny")

Overview

Kickstarter is an American public-benefit corporation based in Brooklyn, New York, that maintains a global crowd funding platform focused on creativity. The company’s stated mission is to “help bring creative projects to life”.

Kickstarter has reportedly received more than $1.9 billion in pledges from 9.4 million backers to fund 257,000 creative projects, such as films, music, stage shows, comics, journalism, video games, technology and food-related projects.

For this assignment, I am asking you to analyze the descriptions of kickstarter projects to identify commonalities of successful (and unsuccessful projects) using the text mining techniques we covered in the past two lectures.


Data

The dataset for this assignment is taken from webroboto.io ‘s repository. They developed a scrapper robot that crawls all Kickstarter projects monthly since 2009. We will just take data from the most recent crawl on 2018-02-15.

To simplify your task, I have downloaded the files and partially cleaned the scraped data. In particular, I converted several JSON columns, corrected some obvious data issues, and removed some variables that are not of interest (or missing frequently). I have also subsetted the data to only contain projects originating in the United States (to have only English language and USD denominated projects).

The data is contained in the file kickstarter_projects.csv and contains about 150,000 projects and about 20 variables.

Tasks for the Assignment

1. Identifying Successful Projects

Below, I’m reading the data straight from github.

# d <- read.csv("https://raw.githubusercontent.com/QMSS-GR5063-2018/DV_CU_course_material/master/Exercises/09_kickstarter/kickstarter_projects.csv?token=AZbwLuzRYxYEP2HM2U0W6xLJR6ZeCw8-ks5ayrITwA%3D%3D")

d <- tryCatch(read.csv("https://raw.githubusercontent.com/QMSS-GR5063-2018/DV_CU_course_material/master/Exercises/09_kickstarter/kickstarter_projects.csv?token=AZbwLuzRYxYEP2HM2U0W6xLJR6ZeCw8-ks5ayrITwA%3D%3D"), error=function(e){
  tryCatch(read.csv("https://raw.githubusercontent.com/QMSS-GR5063-2018/DV_CU_course_material/master/Exercises/09_kickstarter/kickstarter_projects.csv?token=AGLPALRW2PWE4X2PF4ZZSGC5AR3WK"), 
           error=function(e){
             tryCatch(get(load("kickstarter_projects.csv.rda")), error=function(e) NULL)
           })
})

Interestingly enough, it seems that .rda (RData) files are smaller than .f (feather) files.

kickstarter_projects <- d

save_kickstarter_projects = F
if(save_kickstarter_projects) save(kickstarter_projects, file="kickstarter_projects.csv.rda") # feather::write_feather(kickstarter_projects, "kickstarter_projects.csv.f")
if(is.null(d)) d <- data.frame(
  backers_count = c(51L, 26L, 89L, 41L, 11L),
  converted_pledged_amount = c(1536L, 1616L, 3700L, 1510L, 1054L),
  goal = c(1000, 1500, 500, 1200, 2957),
  id = c(747214266L, 167852290L, 954681482L, 219527796L,
         1832299147L),
  pledged = c(1536.01, 1616, 3700, 1510, 1054),
  blurb = as.factor(c("Fancy Ketchup seeks the help of its loyal fan base to raise enough money to fund its followup to its first album,
                                          \"Hold the Mayo.\"",
                      "THE PLATH PROJECT: TWO WORLD PREMIERES\n@ The Center for New Music, San Francisco",
                      "Quality handmade pens made from exotic hardwoods and other elegant materials",
                      "\"The Dracula Letters\" is the latest project by composer S.J. Pettersson featuring famed mezzo soprano Iris Malkin.",
                      "Powerfully Healing Perspective... Learn how flipping the current pain scale empowers your body/mind to rewire faulty brain programming!")),
  country = as.factor(c("USA", "USA", "USA", "USA", "USA")),
  created_at = as.factor(c("2013-02-06", "2014-10-09",
                           "2012-09-29", "2014-10-02",
                           "2018-01-21")),
  currency = as.factor(c("USD", "USD", "USD", "USD", "USD")),
  deadline = as.factor(c("2013-03-15", "2014-12-24",
                         "2012-10-30", "2014-11-05",
                         "2018-02-23")),
  is_starrable = as.factor(c("false", "false", "false", "false",
                             "true")),
  launched_at = as.factor(c("2013-02-13", "2014-10-25",
                            "2012-10-10", "2014-10-06",
                            "2018-01-24")),
  name = as.factor(c("Fancy Ketchup's Second Album",
                     "The Plath Project",
                     "Handcrafted Pens Made from Exotic Woods",
                     "\"The Dracula Letters\" -  by S.J. Pettersson",
                     "Comfort Quest- HEALING Pain through a Transformative Lens")),
  slug = as.factor(c("fancy-ketchups-second-album",
                     "the-plath-project",
                     "handcrafted-pens-made-from-exotic-woods",
                     "the-dracula-letters-by-sj-pettersson",
                     "comfort-quest-healing-pain-through-a-transformativ")),
  source_url = as.factor(c("https://www.kickstarter.com/discover/categories/music/rock",
                           "https://www.kickstarter.com/discover/categories/music/classical%20music",
                           "https://www.kickstarter.com/discover/categories/crafts",
                           "https://www.kickstarter.com/discover/categories/music/classical%20music",
                           "https://www.kickstarter.com/discover/categories/publishing/nonfiction")),
  spotlight = as.factor(c("true", "true", "true", "true",
                          "false")),
  staff_pick = as.factor(c("false", "true", "false", "true",
                           "false")),
  state = as.factor(c("successful", "successful",
                      "successful", "successful", "live")),
  state_changed_at = as.factor(c("2013-03-15", "2014-12-24",
                                 "2012-10-30", "2014-11-05",
                                 "2018-01-24")),
  location_town = as.factor(c("Los Angeles", "San Francisco",
                              "Tremonton", "Los Angeles",
                              "Jacksonville")),
  location_state = as.factor(c("CA", "CA", "UT", "CA", "FL")),
  top_category = as.factor(c("music", "music", "crafts", "music",
                             "publishing")),
  sub_category = as.factor(c("rock", "classical music", NA,
                             "classical music", "nonfiction"))
)

a) Success by Category

There are several ways to identify success of a project:
- State (state): Whether a campaign was successful or not.
- Pledged Amount (pledged)
- Achievement Ratio: Create a variable achievement_ratio by calculating the percentage of the original monetary goal reached by the actual amount pledged (that is pledged\goal 100).*
- Number of backers (backers_count)
- How quickly the goal was reached (difference between launched_at and state_changed_at) for those campaigns that were successful.

success by achievement ratio

library(lubridate)
library(tidyverse)
library(plotly)
summary(d$state)
  canceled     failed       live successful  suspended 
      5994      53189       4191      84457        386 
summary(d$pledged)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       0      123     1661    12273     6293 10266846 
summary(d$achievement_ratio <- (d$goal / d$pledged) * 100)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
   0.002   81.974   98.280      Inf 3793.627      Inf       52 
summary(d$backers_count)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
     0.0      4.0     28.0    148.5     87.0 105857.0 
summary(d$goal_time <- date(d$state_changed_at) - date(d$launched_at))
  Length    Class     Mode 
  148217 difftime  numeric 
summary(d$goal_time_weeks <- (date(d$state_changed_at) - date(d$launched_at)) / 7)
  Length    Class     Mode 
  148217 difftime  numeric 

Use one or more of these measures to visually summarize which categories were most successful in attracting funding on kickstarter. Briefly summarize your findings.

My plotly map is a histogram of the number of projects in each category, characterized by the frequency of each state condition for its projects. Music has the highest number of successful projects. Then film and publishing follow suite next in quantity of successes. Tech has the most failures, but interestingly a lot of backing by backers (next visualization). Perhaps this is because tech can be more challenging than basic musical or literature-based projects.

After reviewing some of my visualizations below, it is pretty apparent that each of the top categories had more successes than any other one state condition. It looks like by a moderate margin, technology has the largest number of successful projects that were backed by people/groups/institutions. For most of the categories, successful projects are the ones that had the highest backing. This makes sense, becase they were being supported. This pattern is true for actually every category. The journalism category looks like it has the fewest backed projects, but the variability between its states of completion is much lower than seemingly all of the other categories. For theater, it seems that almost every one of its few number of projects succeeded in terms of backrer support. The second and third most successful categories are film/video and games respectively. All of this seems intuitive because of the popularity of media, technology, and virtual gaming in the modern world.

The second two visualizations below show the lenth of time it took for each project to reach a change in state/completion. By and large, the most successful projects tok the most amount of time, with theater’ taking a slight lead over the rest. Most of the briefest project durations were the ones that were canceled. That’s good because it means people weren’t wasting TOO much time on projects that would go nowhere. But still, some of the failed projects outrank some of the successes in some categories in time spent on them, like technology. It’s interesting how technology has the greatest number of successful projects, but its failed projects took more time than did its successful ones (on average).

plot_ly(d, x = ~ top_category, color = ~ state) %>%
  add_histogram()
ggplot(d, aes(x = top_category, y = backers_count, fill = state)) + 
  geom_bar(stat = "identity", position = "dodge") + coord_flip() + ylab("Number of Backers") + xlab("Top Category") + ggtitle("Success by Popular Category")

ggplot(d, aes(x = top_category, y = goal_time_weeks, fill = state)) + 
  geom_bar(stat = "identity", position = "dodge") + coord_flip() + ylab("Total Weeks to Finish Project") + xlab("Top Category") + ggtitle("Success by Popular Category")

ggplot(d, aes(x = top_category, y = goal_time, fill = state)) + 
  geom_bar(stat = "identity") + coord_flip() + ylab("Total Days to Finish Project") + xlab("Top Project Category") + ggtitle("Project Success by Popular Category")

library(plotly)

setup_plotly = F
if(setup_plotly){
  R.home(component = "home")
  usethis::edit_r_environ()
  Sys.setenv("plotly_username"="SRhoads")
  Sys.setenv("plotly_api_key"="VL5XaziLtaphG9hlICkH")
}
(p <- plot_ly(d, x = ~ goal_time_weeks, color = ~ top_category, type = "box"))
(plot.b4.ly <- ggplot(d,
                      aes(x= reorder(top_category, goal_time_weeks, na.rm=TRUE),
                          y = goal_time_weeks)) + 
   geom_boxplot(aes(fill = state), 
                outlier.colour = "transparent", 
                alpha = 0.3) +
   coord_flip() + 
   labs(x = "Popular Project Category", y = "Donor Procurement Rate") +
   geom_jitter(shape = 21, 
               aes(fill = state), 
               size = 1,
               position = position_jitter(w = 0.01))) 

(plot.b4.ly.2 <- ggplot(d,
                        aes(x= reorder(top_category, goal_time_weeks, na.rm=TRUE),
                            y = goal_time_weeks)) + 
    geom_boxplot( 
      outlier.colour = "transparent", 
      alpha = 0.3) +
    coord_flip() + 
    labs(x = "Popular Project Category", y = "Donor Procurement Rate") +
    geom_jitter(shape = 21, 
                aes(fill = state), 
                size = 1,
                position = position_jitter(w = 0.01))) 

(plot.b4.ly.3 <- ggplot(d,
                        aes(x= reorder(top_category, goal_time_weeks, na.rm=TRUE),
                            y = goal_time_weeks)) + 
    geom_boxplot(aes(fill = state), 
                 outlier.colour = "transparent", 
                 alpha = 0.3) +
    coord_flip() + labs(x = "Popular Project Category", y = "Donor Procurement Rate"))

ggplotly(plot.b4.ly)

BONUS ONLY: b) Success by Location

Now, use the location information to calculate the total number of successful projects by state (if you are ambitious, normalize by population). Also, identify the Top 50 “innovative” cities in the U.S. (by whatever measure you find plausible). Provide a leaflet map showing the most innovative states and cities in the U.S. on a single map based on these information.

Below, there are two tables that illustrate high-performing geographic areas. The first table shows is that the top countries (when leaving out the dominant United States) are Canada, Great Britain, the Netherlans, and o on. The next table evaluates the number of successful projects based on USA state. I have the 50 states ranked from largest quantity of successful projects to lowest quantity of successful projects. Unsurprisingly to me, CA and NY rank on top. These states are two of America’s most digitally connective and proactive states. I grew up in the Bay Area and then moved to NYC for college, so I clearly have a biased view of the US.

HTML("dimensions:")
dimensions:
dim(d.1.non.usa <- d %>% dplyr::select(everything()) %>%
      subset(country != "USA") %>%
      subset(state == "successful") %>%
      dplyr::group_by(country) %>%
      dplyr::summarize(`Number of Successful Projects` = n()) %>%
      dplyr::arrange(desc(`Number of Successful Projects`)))
[1] 15  2
# summarise(project.count = count(state[state == "successful"])))
names(d) %>% paste0("<code>", ., "</code>", collapse=", ") %>% HTML()
backers_count, blurb, converted_pledged_amount, country, created_at, currency, deadline, goal, id, is_starrable, launched_at, name, pledged, slug, source_url, spotlight, staff_pick, state, state_changed_at, location_town, location_state, top_category, sub_category, achievement_ratio, goal_time, goal_time_weeks
HTML("dimensions:")
dimensions:
dim(top50cities <- head(d.1 <- d %>% dplyr::select(everything()) %>%
                          subset(country == "USA") %>%
                          subset(state == "successful") %>%
                          dplyr::group_by(location_town, location_state) %>%
                          dplyr::summarize( `Number of Successful Projects` = n()) %>%
                          dplyr::arrange(desc(`Number of Successful Projects`)), n = 50))
[1] 50  3
plot_ly(d.1 %>% rename(`USA State`=location_state), x = ~ `Number of Successful Projects`, y = ~ `USA State`, color = ~ `USA State`, type = "bar")
plot_ly(d.1.non.usa, x = ~ `Number of Successful Projects`, y = ~ country, color = ~ country, type = "bar")
# The plot above works, but it makes the knit take wayyyyy too long so I commented it out

As evident in the table below, the highest performing cities (based on number of successful projects) are NYC and LA, in NY and CA respectively. I’m only considering the United States in this descriptive table. The highest ranking scores of successful projects are 6350 ad 6256, respectively.

# head(arrange(d, desc(location_town)), n = 50)
# summarise(project.count = count(state[state == "successful"])))
register_google <- function (key, account_type, client, signature, second_limit, day_limit) {
  
  # get current options
  options <- getOption("ggmap")
  
  # check for client/sig specs
  if (!missing(client) &&  missing(signature) ) {
    stop("if client is specified, signature must be also.")
  }
  if ( missing(client) && !missing(signature) ) {
    stop("if signature is specified, client must be also.")
  }
  if (!missing(client) && !missing(signature) ) {
    if (goog_account() == "standard" && missing(account_type)) {
      stop("if providing client and signature, the account type must be premium.")
    }
  }
  
  # construct new ones
  if(!missing(key)) options$google$key <- key
  if(!missing(account_type)) options$google$account_type <- account_type
  if(!missing(day_limit)) options$google$day_limit <- day_limit
  if(!missing(second_limit)) options$google$second_limit <- second_limit
  if(!missing(client)) options$google$client <- client
  if(!missing(signature)) options$google$signature <- signature
  
  # # set premium defaults
  if (!missing(account_type) && account_type == "premium") {
    if(missing(day_limit)) options$google$day_limit <- 100000
  }
  
  # class
  class(options) <- "ggmap_credentials"
  
  # set new options
  options(ggmap = options)
  
  # return
  invisible(NULL)
}

In order to use ggmap’s geocode() function, you need to go authorize Google API stuff. I think here (https://developers.google.com/places/web-service/get-api-key) you need to make a project & enable relevant APIs to geocoding. So that includes the Places API, Geocoding API, and maybe others? Like geolocation or static maps?

Below, I provide a leaflet visualization of the top 50 cities on the metric of number of successful projects. The popups describe the city name as well as the number of successful projects it has housed.

library(leaflet)
library(ggmap)

cities50 <- as.character(top50cities$location_town)
geocode_locations=F # geocode_locations=T
# if(geocode_locations) 
locations <- tryCatch(geocode(cities50, output = c("latlon")), error=function(e){
  tryCatch(read.csv("locations.datavis.ass3.csv"), error=function(e) NULL)
})

if(is.null(locations) | length(distinct(locations)) < 3) locations <- data.frame(
  X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
        15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
        27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L,
        40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L),
  lon = c(NA, -118.2436849, -73.9441579, -122.4194155, -87.6297982,
          -122.6764815, -122.3320708, -97.7430608, -71.0588801,
          -75.1652215, -93.2650108, -86.7816016, -120.7401385, -84.3879824,
          -117.1610838, -104.990251, -73.9712488, -96.7969879, -122.2711137,
          -79.9958864, -111.8910474, NA, -76.6121893, NA, NA, -112.0740373,
          -81.3792365, -115.1398296, NA, -82.9987942, -94.5785667,
          -80.1917902, -110.9747108, -105.2705456, -121.4943996, NA, NA, NA,
          -122.272747, NA, NA, NA, NA, -86.158068, NA, -118.3286614,
          -78.6381787, -89.4012302, -98.4936282, 0.121817),
  lat = c(NA, 34.0522342, 40.6781784, 37.7749295, 41.8781136,
          45.5230622, 47.6062095, 30.267153, 42.3600825, 39.9525839,
          44.977753, 36.1626638, 47.7510741, 33.7489954, 32.715738,
          39.7392358, 40.7830603, 32.7766642, 37.8043637, 40.4406248, 40.7607793,
          NA, 39.2903848, NA, NA, 33.4483771, 28.5383355, 36.1699412, NA,
          39.9611755, 39.0997265, 25.7616798, 32.2226066, 40.0149856,
          38.5815719, NA, NA, NA, 37.8715926, NA, NA, NA, NA, 39.768403, NA,
          34.0928092, 35.7795897, 43.0730517, 29.4241219, 52.205337)
)

# geocode("San Francisco", output = c("latlon"))
#  ccxcxxcx≈    b   vcbwrite.csv(locations, "locations.datavis.ass3.csv")

head(top50 <- cbind(data.frame(top50cities), data.frame(locations))) %>% DT::datatable()
# usa <- get_map("USA", zoom = 8, source = "google", maptype = "hybrid")

pal = colorFactor("Set1", domain = "") # Grab a palette
color_offsel1 = pal(top50$location_state)

greens <- colorNumeric("Reds", domain = NULL)
leaflet(top50) %>% 
  addProviderTiles(providers$Esri.WorldStreetMap) %>% 
  addCircleMarkers(popup = paste("City of", top50$location_town, "<br/>", "Project Successes:", top50$Number.of.Successful.Projects),
                   color = ~greens(Number.of.Successful.Projects)) %>% 
  addLegend(pal = pal, values = ~ top50$location_state, title = "States of Top 50 Cities")
usa <- c(left = -125, bottom = 25.75, right = -67, top = 49)
get_stamenmap(usa, zoom = 5, maptype = "toner-lite") %>% ggmap(top50)

==================================================