Kickstarter is a crowdfunding platform that hosts details of projects whose founders seek funding. The public can view projects and decide to donate money towards helping them reach their funding goal. On Kaggle, there is a data set of over 370K rows and 11 columns with details about Kickstarter projects. The goal is to build a SVM model and a classification decision tree that predicts a project’s “State”: Successful, Failed, Canceled, or Suspended. Which is better for this problem?
library(tidyverse)
library(tidytext)
library(stopwords)
library(caret) # for upsampling
library(e1071)
library(tree)
#nrow(df %>% filter(State=="Canceled")) # sus: 1842
df <- read.csv("kickstarter.csv")
suspended_df <- df %>% filter(State=="Suspended")
sample_size <- nrow(suspended_df) # smallest number of instances among the States
successful_df <- df %>% filter(State=="Successful")
x <- sample(1:nrow(successful_df), size=sample_size)
successful_df <- successful_df[x,]
failed_df <- df %>% filter(State=="Failed")
x <- sample(1:nrow(failed_df), size=sample_size)
failed_df <- failed_df[x,]
canceled_df <- df %>% filter(State=="Canceled")
x <- sample(1:nrow(canceled_df), size=sample_size)
canceled_df <- canceled_df[x,]
df <- rbind(suspended_df, successful_df, failed_df, canceled_df)
df %>% group_by(Category) %>%
count(Category) %>%
summarize(n) %>%
arrange(desc(n))
## # A tibble: 15 x 2
## Category n
## <chr> <int>
## 1 Film & Video 1038
## 2 Technology 918
## 3 Music 832
## 4 Games 829
## 5 Design 718
## 6 Publishing 573
## 7 Food 505
## 8 Art 499
## 9 Fashion 471
## 10 Photography 215
## 11 Comics 196
## 12 Crafts 195
## 13 Theater 185
## 14 Journalism 126
## 15 Dance 68
glimpse(df)
## Rows: 7,368
## Columns: 11
## $ ID <int> 323824315, 546540380, 605472774, 2127489984, 313825513, 26~
## $ Name <chr> "Boobs, Breasts, and Sex Appeal : : An artistic exploratio~
## $ Category <chr> "Photography", "Games", "Art", "Music", "Technology", "Pub~
## $ Subcategory <chr> "Photography", "Games", "Art", "Music", "Technology", "Pub~
## $ Country <chr> "United States", "United States", "United States", "United~
## $ Launched <chr> "2010-06-23 01:15:15", "2010-08-09 05:41:40", "2010-08-11 ~
## $ Deadline <chr> "2010-07-26", "2010-11-01", "2010-09-10", "2010-10-01", "2~
## $ Goal <int> 100, 15000, 4500, 800, 17840, 1, 5000, 5000, 1000, 10000, ~
## $ Pledged <int> 0, 0, 0, 0, 95, 0, 0, 0, 0, 0, 31, 50, 25, 0, 0, 0, 70, 0,~
## $ Backers <int> 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 2, 0, 0, 1~
## $ State <chr> "Suspended", "Suspended", "Suspended", "Suspended", "Suspe~
#df <- read.csv("kickstarter.csv", nrows=5000) # 5K has been the best so far, at 77.5% acc
df <- df %>% dplyr::select(-c("ID", "Country", "Subcategory"))
df <- df %>% mutate(Launched = ymd_hms(df$Launched), Deadline = ymd(df$Deadline))
df <- df %>% mutate(LaunchMonth = month(Launched), LaunchIsWeekend=ifelse(wday(Launched) %in% c(6,7,1), 1, 0), DeadlineMonth=month(Deadline), Campaign_Length=round(difftime(ymd(Deadline), ymd_hms(Launched), units="days")))
# abs(DeadlineMonth-LaunchMonth)
df <- df %>% dplyr::select(-c("Launched", "Deadline"))
df$State <- as.factor(df$State)
df$Category <- as.factor(df$Category)
df$LaunchMonth <- as.factor(df$LaunchMonth)
df$LaunchIsWeekend <- as.factor(df$LaunchIsWeekend)
df$DeadlineMonth <- as.factor(df$DeadlineMonth)
Top 3 words in the project name by category
x <- stopwords("en", source = "snowball")
stop_words <- data.frame(word=x)
y <- df %>% unnest_tokens(output=word, input=Name) %>%
anti_join(stop_words, by = c("word" = "word")) %>%
filter(word != tolower(Category)) %>%
count(Category, State, word, sort=TRUE) %>%
filter(State=="Canceled") %>%
group_by(Category) %>%
slice_max(n,n=3, with_ties=FALSE)
z <- data.frame(Category=unique(y$Category), top_words=length(unique(y$Category))*NA)
for (i in 1:nrow(z)) {
category <- z$Category[i]
top_words <- y %>% filter(Category==category) %>% pull(word)
z$top_words[i] <- I(list(top_words))
}
# Add column
# Name Text Analysis
# Drop Subcategory because there is little representation for many of them. Top 3 words in each Category for the Successful projects only. Add this column, SuccessfulName: It is a boolean telling whether the Name of the project contains ANY of the top words for its Category.
df$Name <- tolower(df$Name)
df$SuccessfulName <- nrow(df)*NA
# get top words for each category
art_words <- unlist(z %>% filter(Category=="Art") %>% pull(top_words))
comics_words <- unlist(z %>% filter(Category=="Comics") %>% pull(top_words))
crafts_words <- unlist(z %>% filter(Category=="Crafts") %>% pull(top_words))
dance_words <- unlist(z %>% filter(Category=="Dance") %>% pull(top_words))
design_words <- unlist(z %>% filter(Category=="Design") %>% pull(top_words))
fashion_words <- unlist(z %>% filter(Category=="Fashion") %>% pull(top_words))
film_words <- unlist(z %>% filter(Category=="Film & Video") %>% pull(top_words))
food_words <- unlist(z %>% filter(Category=="Food") %>% pull(top_words))
games_words <- unlist(z %>% filter(Category=="Games") %>% pull(top_words))
journalism_words <- unlist(z %>% filter(Category=="Journalism") %>% pull(top_words))
music_words <- unlist(z %>% filter(Category=="Music") %>% pull(top_words))
photography_words <- unlist(z %>% filter(Category=="Photography") %>% pull(top_words))
publishing_words <- unlist(z %>% filter(Category=="Publishing") %>% pull(top_words))
technology_words <- unlist(z %>% filter(Category=="Technology") %>% pull(top_words))
theater_words <- unlist(z %>% filter(Category=="Theater") %>% pull(top_words))
for (i in 1:nrow(df)) {
category <- df$Category[i]
# get top words for this category:
cat_words <- switch(
category,
"Art" = art_words,
"Comics" = comics_words,
"Crafts" = crafts_words,
"Dance" = dance_words,
"Design" = design_words,
"Fashion" = fashion_words,
"Film & Video" = film_words,
"Food" = food_words,
"Games" = games_words,
"Journalism" = journalism_words,
"Music" = music_words,
"Photography" = photography_words,
"Publishing" = publishing_words,
"Technology" = technology_words,
"Theater" = theater_words
)
df$SuccessfulName[i] <- if_else( any(str_detect(df$Name[i], cat_words)), 1, 0)
}
df <- df %>% dplyr::select(-c("Name"))
Class imbalance
round(prop.table(table(dplyr::select(df, State))),2)
##
## Canceled Failed Successful Suspended
## 0.25 0.25 0.25 0.25
Split into training and test sets, 75-25.
set.seed(1234)
sample_set <- sample(nrow(df), round(nrow(df)*0.75), replace=FALSE)
df_train <- df[sample_set, ]
df_test <- df[-sample_set, ]
Train SVM model. Which features should be used? Which kernel is best? Cost?
mod <- svm(State ~ ., data = df_train , kernel = "linear",
cost = 100 , scale = TRUE)
mod
##
## Call:
## svm(formula = State ~ ., data = df_train, kernel = "linear", cost = 100,
## scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 100
##
## Number of Support Vectors: 4927
# tune.out <- tune(svm, State ~ ., data = df_train, kernel = "linear",
# ranges = list(cost =c(0.001 , 0.01 , 0.1 , 1 , 5 , 10 , 100)))
# summary(tune.out)
# best performance: error 0.5226158 at cost 100, 10-fold CV
Make predictions.
pred <- predict(mod, df_test, type="class")
table(pred, df_test$State)
##
## pred Canceled Failed Successful Suspended
## Canceled 301 96 46 32
## Failed 59 216 46 150
## Successful 2 16 254 26
## Suspended 101 153 94 250
Evaluate SVM model.
confusionMatrix(pred, df_test$State)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Canceled Failed Successful Suspended
## Canceled 301 96 46 32
## Failed 59 216 46 150
## Successful 2 16 254 26
## Suspended 101 153 94 250
##
## Overall Statistics
##
## Accuracy : 0.5543
## 95% CI : (0.5312, 0.5772)
## No Information Rate : 0.2611
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.405
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Canceled Class: Failed Class: Successful
## Sensitivity 0.6501 0.4491 0.5773
## Specificity 0.8738 0.8126 0.9686
## Pos Pred Value 0.6337 0.4586 0.8523
## Neg Pred Value 0.8815 0.8067 0.8795
## Prevalence 0.2514 0.2611 0.2389
## Detection Rate 0.1634 0.1173 0.1379
## Detection Prevalence 0.2579 0.2557 0.1618
## Balanced Accuracy 0.7620 0.6309 0.7729
## Class: Suspended
## Sensitivity 0.5459
## Specificity 0.7486
## Pos Pred Value 0.4181
## Neg Pred Value 0.8328
## Prevalence 0.2486
## Detection Rate 0.1357
## Detection Prevalence 0.3246
## Balanced Accuracy 0.6472
mod <- tree(State~., data=df_train)
## Warning in tree(State ~ ., data = df_train): NAs introduced by coercion
pred <- predict(mod, df_test, type="class")
## Warning in pred1.tree(object, tree.matrix(newdata)): NAs introduced by coercion
confusionMatrix(pred, df_test$State)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Canceled Failed Successful Suspended
## Canceled 272 75 21 35
## Failed 103 293 15 207
## Successful 17 42 295 20
## Suspended 71 71 109 196
##
## Overall Statistics
##
## Accuracy : 0.5733
## 95% CI : (0.5503, 0.596)
## No Information Rate : 0.2611
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.43
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Canceled Class: Failed Class: Successful
## Sensitivity 0.5875 0.6091 0.6705
## Specificity 0.9050 0.7612 0.9437
## Pos Pred Value 0.6749 0.4741 0.7888
## Neg Pred Value 0.8673 0.8464 0.9012
## Prevalence 0.2514 0.2611 0.2389
## Detection Rate 0.1477 0.1591 0.1602
## Detection Prevalence 0.2188 0.3355 0.2030
## Balanced Accuracy 0.7462 0.6852 0.8071
## Class: Suspended
## Sensitivity 0.4279
## Specificity 0.8186
## Pos Pred Value 0.4385
## Neg Pred Value 0.8122
## Prevalence 0.2486
## Detection Rate 0.1064
## Detection Prevalence 0.2427
## Balanced Accuracy 0.6233