Kickstarter Projects

SVM

Introduction

Kickstarter is a crowdfunding platform that hosts details of projects whose founders seek funding. The public can view projects and decide to donate money towards helping them reach their funding goal. On Kaggle, there is a data set of over 370K rows and 11 columns with details about Kickstarter projects. The goal is to build a SVM model and a classification decision tree that predicts a project’s “State”: Successful, Failed, Canceled, or Suspended. Which is better for this problem?

library(tidyverse)
library(tidytext)
library(stopwords)
library(caret) # for upsampling
library(e1071)
library(tree)
#nrow(df %>% filter(State=="Canceled"))  # sus: 1842

df <- read.csv("kickstarter.csv")

suspended_df <- df %>% filter(State=="Suspended")
sample_size <- nrow(suspended_df)   # smallest number of instances among the States

successful_df <- df %>% filter(State=="Successful")
x <- sample(1:nrow(successful_df), size=sample_size)
successful_df <- successful_df[x,]

failed_df <- df %>% filter(State=="Failed")
x <- sample(1:nrow(failed_df), size=sample_size)
failed_df <- failed_df[x,]

canceled_df <- df %>% filter(State=="Canceled")
x <- sample(1:nrow(canceled_df), size=sample_size)
canceled_df <- canceled_df[x,]
df <- rbind(suspended_df, successful_df, failed_df, canceled_df)
df %>% group_by(Category) %>% 
  count(Category) %>% 
  summarize(n) %>% 
  arrange(desc(n))
## # A tibble: 15 x 2
##    Category         n
##    <chr>        <int>
##  1 Film & Video  1038
##  2 Technology     918
##  3 Music          832
##  4 Games          829
##  5 Design         718
##  6 Publishing     573
##  7 Food           505
##  8 Art            499
##  9 Fashion        471
## 10 Photography    215
## 11 Comics         196
## 12 Crafts         195
## 13 Theater        185
## 14 Journalism     126
## 15 Dance           68
glimpse(df)
## Rows: 7,368
## Columns: 11
## $ ID          <int> 323824315, 546540380, 605472774, 2127489984, 313825513, 26~
## $ Name        <chr> "Boobs, Breasts, and Sex Appeal : : An artistic exploratio~
## $ Category    <chr> "Photography", "Games", "Art", "Music", "Technology", "Pub~
## $ Subcategory <chr> "Photography", "Games", "Art", "Music", "Technology", "Pub~
## $ Country     <chr> "United States", "United States", "United States", "United~
## $ Launched    <chr> "2010-06-23 01:15:15", "2010-08-09 05:41:40", "2010-08-11 ~
## $ Deadline    <chr> "2010-07-26", "2010-11-01", "2010-09-10", "2010-10-01", "2~
## $ Goal        <int> 100, 15000, 4500, 800, 17840, 1, 5000, 5000, 1000, 10000, ~
## $ Pledged     <int> 0, 0, 0, 0, 95, 0, 0, 0, 0, 0, 31, 50, 25, 0, 0, 0, 70, 0,~
## $ Backers     <int> 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 2, 0, 0, 1~
## $ State       <chr> "Suspended", "Suspended", "Suspended", "Suspended", "Suspe~

Feature Engineering

#df <- read.csv("kickstarter.csv", nrows=5000)  # 5K has been the best so far, at 77.5% acc
df <- df %>% dplyr::select(-c("ID", "Country", "Subcategory"))

df <- df %>% mutate(Launched = ymd_hms(df$Launched), Deadline = ymd(df$Deadline))
df <- df %>% mutate(LaunchMonth = month(Launched), LaunchIsWeekend=ifelse(wday(Launched) %in% c(6,7,1), 1, 0), DeadlineMonth=month(Deadline), Campaign_Length=round(difftime(ymd(Deadline), ymd_hms(Launched), units="days")))
# abs(DeadlineMonth-LaunchMonth)


df <- df %>% dplyr::select(-c("Launched", "Deadline"))

df$State <- as.factor(df$State)
df$Category <- as.factor(df$Category)
df$LaunchMonth <- as.factor(df$LaunchMonth)
df$LaunchIsWeekend <- as.factor(df$LaunchIsWeekend)
df$DeadlineMonth <- as.factor(df$DeadlineMonth)

Top 3 words in the project name by category

x <- stopwords("en", source = "snowball")
stop_words <- data.frame(word=x)
y <- df %>% unnest_tokens(output=word, input=Name) %>%
        anti_join(stop_words, by = c("word" = "word")) %>% 
        filter(word != tolower(Category)) %>% 
        count(Category, State, word, sort=TRUE) %>% 
        filter(State=="Canceled") %>% 
        group_by(Category) %>% 
        slice_max(n,n=3, with_ties=FALSE)

z <- data.frame(Category=unique(y$Category), top_words=length(unique(y$Category))*NA)
for (i in 1:nrow(z)) {
  category <- z$Category[i]
  top_words <- y %>% filter(Category==category) %>% pull(word)
  z$top_words[i] <- I(list(top_words))
}
# Add column 
# Name Text Analysis
# Drop Subcategory because there is little representation for many of them. Top 3 words in each Category for the Successful projects only. Add this column, SuccessfulName: It is a boolean telling whether the Name of the project contains ANY of the top words for its Category.

df$Name <- tolower(df$Name)
df$SuccessfulName <- nrow(df)*NA

# get top words for each category
art_words <- unlist(z %>% filter(Category=="Art") %>% pull(top_words)) 
comics_words <- unlist(z %>% filter(Category=="Comics") %>% pull(top_words))
crafts_words <- unlist(z %>% filter(Category=="Crafts") %>% pull(top_words))
dance_words <- unlist(z %>% filter(Category=="Dance") %>% pull(top_words))
design_words <- unlist(z %>% filter(Category=="Design") %>% pull(top_words))
fashion_words <- unlist(z %>% filter(Category=="Fashion") %>% pull(top_words))
film_words <- unlist(z %>% filter(Category=="Film & Video") %>% pull(top_words))
food_words <- unlist(z %>% filter(Category=="Food") %>% pull(top_words))
games_words <- unlist(z %>% filter(Category=="Games") %>% pull(top_words))
journalism_words <- unlist(z %>% filter(Category=="Journalism") %>% pull(top_words))
music_words <- unlist(z %>% filter(Category=="Music") %>% pull(top_words))
photography_words <- unlist(z %>% filter(Category=="Photography") %>% pull(top_words))
publishing_words <- unlist(z %>% filter(Category=="Publishing") %>% pull(top_words))
technology_words <- unlist(z %>% filter(Category=="Technology") %>% pull(top_words))
theater_words <- unlist(z %>% filter(Category=="Theater") %>% pull(top_words))

for (i in 1:nrow(df)) {
  category <- df$Category[i]
  # get top words for this category:
  cat_words <-  switch(
    category,
    "Art" = art_words,
    "Comics" = comics_words,
    "Crafts" = crafts_words,
    "Dance" = dance_words,
    "Design" = design_words,
    "Fashion" = fashion_words,
    "Film & Video" = film_words,
    "Food" = food_words,
    "Games" = games_words,
    "Journalism" = journalism_words,
    "Music" = music_words,
    "Photography" = photography_words,
    "Publishing" = publishing_words,
    "Technology" = technology_words,
    "Theater" = theater_words
  )
  df$SuccessfulName[i] <- if_else( any(str_detect(df$Name[i], cat_words)), 1, 0) 
}

df <- df %>% dplyr::select(-c("Name"))

Class imbalance

round(prop.table(table(dplyr::select(df, State))),2)
## 
##   Canceled     Failed Successful  Suspended 
##       0.25       0.25       0.25       0.25

Split into training and test sets, 75-25.

set.seed(1234)
sample_set <- sample(nrow(df), round(nrow(df)*0.75), replace=FALSE)
df_train <- df[sample_set, ]
df_test <- df[-sample_set, ]

Train SVM model. Which features should be used? Which kernel is best? Cost?

mod <- svm(State ~ ., data = df_train , kernel = "linear",
cost = 100 , scale = TRUE)
mod
## 
## Call:
## svm(formula = State ~ ., data = df_train, kernel = "linear", cost = 100, 
##     scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  100 
## 
## Number of Support Vectors:  4927
# tune.out <- tune(svm, State ~ ., data = df_train, kernel = "linear", 
#                  ranges = list(cost =c(0.001 , 0.01 , 0.1 , 1 , 5 , 10 , 100)))
# summary(tune.out)
# best performance: error 0.5226158 at cost 100, 10-fold CV

Make predictions.

pred <- predict(mod, df_test, type="class")
table(pred, df_test$State)
##             
## pred         Canceled Failed Successful Suspended
##   Canceled        301     96         46        32
##   Failed           59    216         46       150
##   Successful        2     16        254        26
##   Suspended       101    153         94       250

Evaluate SVM model.

confusionMatrix(pred, df_test$State)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Canceled Failed Successful Suspended
##   Canceled        301     96         46        32
##   Failed           59    216         46       150
##   Successful        2     16        254        26
##   Suspended       101    153         94       250
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5543          
##                  95% CI : (0.5312, 0.5772)
##     No Information Rate : 0.2611          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.405           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: Canceled Class: Failed Class: Successful
## Sensitivity                   0.6501        0.4491            0.5773
## Specificity                   0.8738        0.8126            0.9686
## Pos Pred Value                0.6337        0.4586            0.8523
## Neg Pred Value                0.8815        0.8067            0.8795
## Prevalence                    0.2514        0.2611            0.2389
## Detection Rate                0.1634        0.1173            0.1379
## Detection Prevalence          0.2579        0.2557            0.1618
## Balanced Accuracy             0.7620        0.6309            0.7729
##                      Class: Suspended
## Sensitivity                    0.5459
## Specificity                    0.7486
## Pos Pred Value                 0.4181
## Neg Pred Value                 0.8328
## Prevalence                     0.2486
## Detection Rate                 0.1357
## Detection Prevalence           0.3246
## Balanced Accuracy              0.6472

Decision Tree

mod <- tree(State~., data=df_train)
## Warning in tree(State ~ ., data = df_train): NAs introduced by coercion
pred <- predict(mod, df_test, type="class")
## Warning in pred1.tree(object, tree.matrix(newdata)): NAs introduced by coercion
confusionMatrix(pred, df_test$State)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Canceled Failed Successful Suspended
##   Canceled        272     75         21        35
##   Failed          103    293         15       207
##   Successful       17     42        295        20
##   Suspended        71     71        109       196
## 
## Overall Statistics
##                                          
##                Accuracy : 0.5733         
##                  95% CI : (0.5503, 0.596)
##     No Information Rate : 0.2611         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.43           
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: Canceled Class: Failed Class: Successful
## Sensitivity                   0.5875        0.6091            0.6705
## Specificity                   0.9050        0.7612            0.9437
## Pos Pred Value                0.6749        0.4741            0.7888
## Neg Pred Value                0.8673        0.8464            0.9012
## Prevalence                    0.2514        0.2611            0.2389
## Detection Rate                0.1477        0.1591            0.1602
## Detection Prevalence          0.2188        0.3355            0.2030
## Balanced Accuracy             0.7462        0.6852            0.8071
##                      Class: Suspended
## Sensitivity                    0.4279
## Specificity                    0.8186
## Pos Pred Value                 0.4385
## Neg Pred Value                 0.8122
## Prevalence                     0.2486
## Detection Rate                 0.1064
## Detection Prevalence           0.2427
## Balanced Accuracy              0.6233