Kickstarter is an online platform where project founders post details of projects they want funded; the public can view projects and donate money to help them reach their funding goals. Kaggle hosts a dataset of over 370K Kickstarter projects, with 11 columns of details per project. Our first goal is to build a classification decision tree that predicts a project’s “State”: Successful, Failed, Canceled, or Suspended.
library(tidyverse)
library(lubridate) # for ymd_hms(), ymd(), month(), wday() below
library(corrplot)
library(reshape2)
library(tree)
library(caret)
library(rpart)
library(tidytext)
library(stopwords)
df <- read.csv('kickstarter.csv')
df <- df[1:50000,] # keep the first 50,000 rows for tractability; nearly all of these are US projects
df1 <- df
summary(df)
## ID Name Category Subcategory
## Min. :3.941e+04 Length:50000 Length:50000 Length:50000
## 1st Qu.:5.413e+08 Class :character Class :character Class :character
## Median :1.077e+09 Mode :character Mode :character Mode :character
## Mean :1.077e+09
## 3rd Qu.:1.613e+09
## Max. :2.147e+09
## Country Launched Deadline Goal
## Length:50000 Length:50000 Length:50000 Min. : 0
## Class :character Class :character Class :character 1st Qu.: 1950
## Mode :character Mode :character Mode :character Median : 4000
## Mean : 12937
## 3rd Qu.: 10000
## Max. :21474836
## Pledged Backers State
## Min. : 0 Min. : 0.00 Length:50000
## 1st Qu.: 100 1st Qu.: 3.00 Class :character
## Median : 976 Median : 17.00 Mode :character
## Mean : 4285 Mean : 58.88
## 3rd Qu.: 3520 3rd Qu.: 51.00
## Max. :10266846 Max. :87142.00
df <- df %>% mutate(Launched = ymd_hms(Launched), Deadline = ymd(Deadline))
df <- df %>% mutate(
  LaunchMonth = month(Launched),
  LaunchIsWeekend = ifelse(wday(Launched) %in% c(6, 7, 1), 1, 0), # Friday, Saturday, or Sunday launch (wday: 1 = Sunday)
  DeadlineMonth = month(Deadline),
  Campaign_Length = abs(DeadlineMonth - LaunchMonth) # difference in calendar month numbers
)
df <- df %>% dplyr::select(-c("ID", "Launched", "Deadline"))
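Note that Campaign_Length is the absolute difference between month numbers, so a short campaign that crosses a month boundary counts as 1 while one running from December to January counts as 11. If a duration-in-days feature were preferred, it could be computed from the parsed dates before they are dropped; a minimal sketch on the raw copy df1 (df_days and Campaign_Days are hypothetical names and are not used in the models below):
df_days <- df1 %>%
  mutate(Launched = ymd_hms(Launched),
         Deadline = ymd(Deadline),
         Campaign_Days = as.numeric(Deadline - as_date(Launched))) # whole days between launch and deadline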
Next, find the top three words appearing in project names for each category, among Successful projects only.
x <- stopwords("en", source = "snowball")
stop_words <- data.frame(word=x)
y <- df %>% unnest_tokens(output=word, input=Name) %>%
anti_join(stop_words, by = c("word" = "word")) %>%
filter(word != tolower(Category)) %>%
count(Category, State, word, sort=TRUE) %>%
filter(State=="Successful") %>%
group_by(Category) %>%
slice_max(n,n=3, with_ties=FALSE)
z <- data.frame(Category = unique(y$Category), top_words = NA)
for (i in 1:nrow(z)) {
category <- z$Category[i]
top_words <- y %>% filter(Category==category) %>% pull(word)
z$top_words[i] <- I(list(top_words))
}
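The loop above fills the list-column one category at a time. The same lookup table could be built in one step with a grouped summarise; a sketch that should be equivalent in content to z (z_alt is a hypothetical name):
z_alt <- y %>%
  group_by(Category) %>%
  summarise(top_words = list(word), .groups = "drop") # one list element of top words per category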
head(z)
## Category top_words
## 1 Art project, new, 2011
## 2 Comics comic, book, graphic
## 3 Crafts project, new, handmade
## 4 Dance new, project, company
## 5 Design iphone, ipad, case
## 6 Fashion collection, clothing, jewelry
Name text analysis: drop Subcategory, since many subcategories have very little representation, and use the top three words per Category among Successful project names to add a new column, SuccessfulName, a boolean indicating whether a project’s Name contains ANY of the top words for its Category.
df$Name <- tolower(df$Name)
df$SuccessfulName <- NA # placeholder; filled in by the loop below
# get top words for each category
art_words <- unlist(z %>% filter(Category=="Art") %>% pull(top_words))
comics_words <- unlist(z %>% filter(Category=="Comics") %>% pull(top_words))
crafts_words <- unlist(z %>% filter(Category=="Crafts") %>% pull(top_words))
dance_words <- unlist(z %>% filter(Category=="Dance") %>% pull(top_words))
design_words <- unlist(z %>% filter(Category=="Design") %>% pull(top_words))
fashion_words <- unlist(z %>% filter(Category=="Fashion") %>% pull(top_words))
film_words <- unlist(z %>% filter(Category=="Film & Video") %>% pull(top_words))
food_words <- unlist(z %>% filter(Category=="Food") %>% pull(top_words))
games_words <- unlist(z %>% filter(Category=="Games") %>% pull(top_words))
journalism_words <- unlist(z %>% filter(Category=="Journalism") %>% pull(top_words))
music_words <- unlist(z %>% filter(Category=="Music") %>% pull(top_words))
photography_words <- unlist(z %>% filter(Category=="Photography") %>% pull(top_words))
publishing_words <- unlist(z %>% filter(Category=="Publishing") %>% pull(top_words))
technology_words <- unlist(z %>% filter(Category=="Technology") %>% pull(top_words))
theater_words <- unlist(z %>% filter(Category=="Theater") %>% pull(top_words))
for (i in 1:nrow(df)) {
category <- df$Category[i]
# get top words for this category:
cat_words <- switch(
category,
"Art" = art_words,
"Comics" = comics_words,
"Crafts" = crafts_words,
"Dance" = dance_words,
"Design" = design_words,
"Fashion" = fashion_words,
"Film & Video" = film_words,
"Food" = food_words,
"Games" = games_words,
"Journalism" = journalism_words,
"Music" = music_words,
"Photography" = photography_words,
"Publishing" = publishing_words,
"Technology" = technology_words,
"Theater" = theater_words
)
df$SuccessfulName[i] <- if_else( any(str_detect(df$Name[i], cat_words)), 1, 0)
}
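For reference, the per-row loop and switch above could be replaced by a join: collapse each category’s top words into one regular expression, attach it to df by Category, and test the Name once per row. A sketch, assuming the tokens contain no regex metacharacters (word_patterns, df_flagged, and SuccessfulName_alt are hypothetical names; the result should match SuccessfulName):
word_patterns <- y %>%
  group_by(Category) %>%
  summarise(pattern = paste(word, collapse = "|"), .groups = "drop") # e.g. "comic|book|graphic"
df_flagged <- df %>%
  left_join(word_patterns, by = "Category") %>%
  mutate(SuccessfulName_alt = ifelse(!is.na(pattern) & str_detect(Name, pattern), 1, 0)) %>%
  dplyr::select(-pattern)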
There are many more samples where Country is “United States” than
other countries. This may affect representation in the data. Replace
this column with isUS, indicating whether the project
originated in the US or elsewhere.
df$isUS <- ifelse(df$Country=="United States", 1, 0)
df$State <- as.factor(df$State)
df <- df %>% dplyr::select(-c("Name", "Subcategory", "Country"))
# Count of each State
ggplot(data=df) +
geom_bar(mapping=aes(x=State)) +
theme_classic() +
ggtitle("Count of Each Project State")
df <- cbind(data.frame(Id = as.integer(rownames(df))), df)
df_melt = melt(subset(df, select = c(Id, Goal, Pledged, Backers, Campaign_Length)), id.vars = "Id")
ggplot(aes(value), data = df_melt) +
  geom_histogram(fill = "navyblue") +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Distributions of Continuous Variables", x = "Value", y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
corrplot::corrplot(
  cor(dplyr::select(df, c(Goal, Pledged, Backers, Campaign_Length)), use = "complete.obs"),
  method = 'circle', type = 'lower', insig = 'blank',
  addCoef.col = 'black', number.cex = 0.8, order = 'AOE', diag = FALSE
)
df <- df %>% dplyr::select(-c("Id"))
All predictors are used, with a 75-25 train-test split. The first tree is fit on the raw copy of the data (df1).
set.seed(1234)
df1$State <- as.factor(df1$State)
df1 <- df1[sample(1:nrow(df1)), ]
x <- df1
sample_set <- sample(nrow(x), round(nrow(x)*0.75), replace=FALSE)
df_train <- x[sample_set, ]
df_test <- x[-sample_set, ]
mod <- tree(State~., data=df_train)
## Warning in tree(State ~ ., data = df_train): NAs introduced by coercion
pred <- predict(mod, df_test, type="class")
## Warning in pred1.tree(object, tree.matrix(newdata)): NAs introduced by coercion
table(pred, df_test$State)
##
## pred Canceled Failed Successful Suspended
## Canceled 0 0 0 0
## Failed 956 5495 100 13
## Successful 53 348 5534 1
## Suspended 0 0 0 0
confusionMatrix(pred, df_test$State)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Canceled Failed Successful Suspended
## Canceled 0 0 0 0
## Failed 956 5495 100 13
## Successful 53 348 5534 1
## Suspended 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8823
## 95% CI : (0.8765, 0.8879)
## No Information Rate : 0.4674
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7823
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Canceled Class: Failed Class: Successful
## Sensitivity 0.00000 0.9404 0.9823
## Specificity 1.00000 0.8394 0.9415
## Pos Pred Value NaN 0.8371 0.9323
## Neg Pred Value 0.91928 0.9414 0.9848
## Prevalence 0.08072 0.4674 0.4507
## Detection Rate 0.00000 0.4396 0.4427
## Detection Prevalence 0.00000 0.5251 0.4749
## Balanced Accuracy 0.50000 0.8899 0.9619
## Class: Suspended
## Sensitivity 0.00000
## Specificity 1.00000
## Pos Pred Value NaN
## Neg Pred Value 0.99888
## Prevalence 0.00112
## Detection Rate 0.00000
## Detection Prevalence 0.00000
## Balanced Accuracy 0.50000
Now repeat the split and fit a tree on the engineered data (df).
df <- df[sample(1:nrow(df)), ]
x <- df
sample_set <- sample(nrow(x), round(nrow(x)*0.75), replace=FALSE)
df_train <- x[sample_set, ]
df_test <- x[-sample_set, ]
mod <- tree(State~., data=df_train)
## Warning in tree(State ~ ., data = df_train): NAs introduced by coercion
pred <- predict(mod, df_test, type="class")
## Warning in pred1.tree(object, tree.matrix(newdata)): NAs introduced by coercion
table(pred, df_test$State)
##
## pred Canceled Failed Successful Suspended
## Canceled 0 0 0 0
## Failed 920 5507 69 13
## Successful 49 308 5632 2
## Suspended 0 0 0 0
confusionMatrix(pred, df_test$State)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Canceled Failed Successful Suspended
## Canceled 0 0 0 0
## Failed 920 5507 69 13
## Successful 49 308 5632 2
## Suspended 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8911
## 95% CI : (0.8855, 0.8965)
## No Information Rate : 0.4652
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7981
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Canceled Class: Failed Class: Successful
## Sensitivity 0.00000 0.9470 0.9879
## Specificity 1.00000 0.8501 0.9472
## Pos Pred Value NaN 0.8461 0.9401
## Neg Pred Value 0.92248 0.9486 0.9894
## Prevalence 0.07752 0.4652 0.4561
## Detection Rate 0.00000 0.4406 0.4506
## Detection Prevalence 0.00000 0.5207 0.4793
## Balanced Accuracy 0.50000 0.8986 0.9675
## Class: Suspended
## Sensitivity 0.0000
## Specificity 1.0000
## Pos Pred Value NaN
## Neg Pred Value 0.9988
## Prevalence 0.0012
## Detection Rate 0.0000
## Detection Prevalence 0.0000
## Balanced Accuracy 0.5000
For both trees, overall accuracy is high, but the model never predicts the Canceled or Suspended states; it does well only on Successful and Failed. This is an effect of class imbalance.
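The imbalance is easy to quantify directly. A quick sketch (output not shown here) of the class proportions in the cleaned data:
# share of each State in the 50,000-row subset; Canceled and Suspended are rare
round(prop.table(table(df$State)), 3)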
plot(mod)
text(mod, pretty=0)
Next, train a random forest with the caret library.
rf_mod <- train(
State ~ .,
data = df_train,
metric = "Accuracy",
method = "rf",
trControl = trainControl(method = "boot632", number=3),
tuneGrid = expand.grid(.mtry = 3) # mtry=sqrt(num predictors)
)
rf_pred <- predict(rf_mod, df_test)
confusionMatrix(rf_pred, df_test$State)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Canceled Failed Successful Suspended
## Canceled 0 0 0 0
## Failed 940 5536 36 13
## Successful 29 279 5665 2
## Suspended 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8961
## 95% CI : (0.8906, 0.9014)
## No Information Rate : 0.4652
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8073
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Canceled Class: Failed Class: Successful
## Sensitivity 0.00000 0.9520 0.9937
## Specificity 1.00000 0.8521 0.9544
## Pos Pred Value NaN 0.8484 0.9481
## Neg Pred Value 0.92248 0.9533 0.9945
## Prevalence 0.07752 0.4652 0.4561
## Detection Rate 0.00000 0.4429 0.4532
## Detection Prevalence 0.00000 0.5220 0.4780
## Balanced Accuracy 0.50000 0.9020 0.9740
## Class: Suspended
## Sensitivity 0.0000
## Specificity 1.0000
## Pos Pred Value NaN
## Neg Pred Value 0.9988
## Prevalence 0.0012
## Detection Rate 0.0000
## Detection Prevalence 0.0000
## Balanced Accuracy 0.5000
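With classes this imbalanced, overall accuracy is dominated by the Failed and Successful classes. One summary that weights every class equally is macro-averaged recall, which can be computed from the per-class statistics confusionMatrix() already returns; a short sketch (cm is a hypothetical name):
cm <- confusionMatrix(rf_pred, df_test$State)
mean(cm$byClass[, "Sensitivity"], na.rm = TRUE) # average of the four per-class sensitivities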
Reset the seed and re-shuffle the data for a new train-test split.
set.seed(4444)
df <- df[sample(1:nrow(df)), ]
x <- df
sample_set <- sample(nrow(x), round(nrow(x)*0.75), replace=FALSE)
df_train <- x[sample_set, ]
df_test <- x[-sample_set, ]
rf_mod1 <- train(
State ~ .,
data = df_train,
metric = "Accuracy",
method = "rf",
trControl = trainControl(method = "boot632", number=3),
tuneGrid = expand.grid(.mtry = 3) # mtry=sqrt(num predictors)
)
rf_pred1 <- predict(rf_mod1, df_test)
confusionMatrix(rf_pred1, df_test$State)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Canceled Failed Successful Suspended
## Canceled 0 0 0 0
## Failed 932 5535 30 11
## Successful 36 247 5709 0
## Suspended 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8995
## 95% CI : (0.8941, 0.9047)
## No Information Rate : 0.4626
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8136
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Canceled Class: Failed Class: Successful
## Sensitivity 0.00000 0.9573 0.9948
## Specificity 1.00000 0.8552 0.9581
## Pos Pred Value NaN 0.8505 0.9528
## Neg Pred Value 0.92256 0.9588 0.9954
## Prevalence 0.07744 0.4626 0.4591
## Detection Rate 0.00000 0.4428 0.4567
## Detection Prevalence 0.00000 0.5206 0.4794
## Balanced Accuracy 0.50000 0.9062 0.9765
## Class: Suspended
## Sensitivity 0.00000
## Specificity 1.00000
## Pos Pred Value NaN
## Neg Pred Value 0.99912
## Prevalence 0.00088
## Detection Rate 0.00000
## Detection Prevalence 0.00000
## Balanced Accuracy 0.50000
The random forests and the first decision tree show negligible differences in accuracy. This is driven by the class imbalance, which ensemble methods alone do not fix. Addressing it would require either gathering more Canceled and Suspended examples or rebalancing the classes during training, for example by resampling or class weighting.
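For example, caret’s trainControl() accepts a sampling argument ("down", "up", "smote", or "rose") that rebalances the classes within each resample. A minimal, untested sketch of down-sampling with the same random forest setup (ctrl_down and rf_down are hypothetical names; the metric is switched to Kappa, which is less dominated by the majority classes than Accuracy):
ctrl_down <- trainControl(method = "boot632", number = 3, sampling = "down")
rf_down <- train(
  State ~ .,
  data = df_train,
  metric = "Kappa",
  method = "rf",
  trControl = ctrl_down,
  tuneGrid = expand.grid(.mtry = 3)
)
confusionMatrix(predict(rf_down, df_test), df_test$State)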