Kickstarter is an online platform where project founders post details of projects they want funded; the public can view projects and donate money to help them reach their funding goals. Kaggle hosts a dataset of over 370K Kickstarter projects, with 11 columns of details per project. Our first goal is to build a classification decision tree that predicts a project’s “State”: Successful, Failed, Canceled, or Suspended.
library(tidyverse)
library(lubridate) # for ymd_hms(), ymd(), month(), wday() below
library(corrplot)
library(reshape2)
library(tree)
library(caret)
library(rpart)
library(tidytext)
library(stopwords)
df <- read.csv('kickstarter.csv')
df <- df[1:50000,] # keep the first 50,000 rows for tractability; nearly all of these are US projects
df1 <- df
summary(df)
## ID Name Category Subcategory
## Min. :3.941e+04 Length:50000 Length:50000 Length:50000
## 1st Qu.:5.413e+08 Class :character Class :character Class :character
## Median :1.077e+09 Mode :character Mode :character Mode :character
## Mean :1.077e+09
## 3rd Qu.:1.613e+09
## Max. :2.147e+09
## Country Launched Deadline Goal
## Length:50000 Length:50000 Length:50000 Min. : 0
## Class :character Class :character Class :character 1st Qu.: 1950
## Mode :character Mode :character Mode :character Median : 4000
## Mean : 12937
## 3rd Qu.: 10000
## Max. :21474836
## Pledged Backers State
## Min. : 0 Min. : 0.00 Length:50000
## 1st Qu.: 100 1st Qu.: 3.00 Class :character
## Median : 976 Median : 17.00 Mode :character
## Mean : 4285 Mean : 58.88
## 3rd Qu.: 3520 3rd Qu.: 51.00
## Max. :10266846 Max. :87142.00
df <- df %>% mutate(Launched = ymd_hms(Launched), Deadline = ymd(Deadline))
df <- df %>% mutate(
  LaunchMonth = month(Launched),
  LaunchIsWeekend = ifelse(wday(Launched) %in% c(6, 7, 1), 1, 0), # Friday, Saturday, or Sunday launch (wday: 1 = Sunday)
  DeadlineMonth = month(Deadline),
  Campaign_Length = abs(DeadlineMonth - LaunchMonth) # difference in calendar month numbers
)
df <- df %>% dplyr::select(-c("ID", "Launched", "Deadline"))
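Note that Campaign_Length is the absolute difference between month numbers, so a short campaign that crosses a month boundary counts as 1 while one running from December to January counts as 11. If a duration-in-days feature were preferred, it could be computed from the parsed dates before they are dropped; a minimal sketch on the raw copy df1 (df_days and Campaign_Days are hypothetical names and are not used in the models below):
df_days <- df1 %>%
  mutate(Launched = ymd_hms(Launched),
         Deadline = ymd(Deadline),
         Campaign_Days = as.numeric(Deadline - as_date(Launched))) # whole days between launch and deadline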
Next, find the top three words appearing in project names for each category, among Successful projects only.
x <- stopwords("en", source = "snowball")
stop_words <- data.frame(word=x)
y <- df %>% unnest_tokens(output=word, input=Name) %>%
anti_join(stop_words, by = c("word" = "word")) %>%
filter(word != tolower(Category)) %>%
count(Category, State, word, sort=TRUE) %>%
filter(State=="Successful") %>%
group_by(Category) %>%
slice_max(n,n=3, with_ties=FALSE)
z <- data.frame(Category = unique(y$Category), top_words = NA)
for (i in 1:nrow(z)) {
category <- z$Category[i]
top_words <- y %>% filter(Category==category) %>% pull(word)
z$top_words[i] <- I(list(top_words))
}
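The loop above fills the list-column one category at a time. The same lookup table could be built in one step with a grouped summarise; a sketch that should be equivalent in content to z (z_alt is a hypothetical name):
z_alt <- y %>%
  group_by(Category) %>%
  summarise(top_words = list(word), .groups = "drop") # one list element of top words per category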
head(z)
## Category top_words
## 1 Art project, new, 2011
## 2 Comics comic, book, graphic
## 3 Crafts project, new, handmade
## 4 Dance new, project, company
## 5 Design iphone, ipad, case
## 6 Fashion collection, clothing, jewelry
Name text analysis: drop Subcategory, since many subcategories have very little representation, and use the top three words per Category among Successful project names to add a new column, SuccessfulName, a boolean indicating whether a project’s Name contains ANY of the top words for its Category.
df$Name <- tolower(df$Name)
df$SuccessfulName <- NA # placeholder; filled in by the loop below
# get top words for each category
art_words <- unlist(z %>% filter(Category=="Art") %>% pull(top_words))
comics_words <- unlist(z %>% filter(Category=="Comics") %>% pull(top_words))
crafts_words <- unlist(z %>% filter(Category=="Crafts") %>% pull(top_words))
dance_words <- unlist(z %>% filter(Category=="Dance") %>% pull(top_words))
design_words <- unlist(z %>% filter(Category=="Design") %>% pull(top_words))
fashion_words <- unlist(z %>% filter(Category=="Fashion") %>% pull(top_words))
film_words <- unlist(z %>% filter(Category=="Film & Video") %>% pull(top_words))
food_words <- unlist(z %>% filter(Category=="Food") %>% pull(top_words))
games_words <- unlist(z %>% filter(Category=="Games") %>% pull(top_words))
journalism_words <- unlist(z %>% filter(Category=="Journalism") %>% pull(top_words))
music_words <- unlist(z %>% filter(Category=="Music") %>% pull(top_words))
photography_words <- unlist(z %>% filter(Category=="Photography") %>% pull(top_words))
publishing_words <- unlist(z %>% filter(Category=="Publishing") %>% pull(top_words))
technology_words <- unlist(z %>% filter(Category=="Technology") %>% pull(top_words))
theater_words <- unlist(z %>% filter(Category=="Theater") %>% pull(top_words))
for (i in 1:nrow(df)) {
category <- df$Category[i]
# get top words for this category:
cat_words <- switch(
category,
"Art" = art_words,
"Comics" = comics_words,
"Crafts" = crafts_words,
"Dance" = dance_words,
"Design" = design_words,
"Fashion" = fashion_words,
"Film & Video" = film_words,
"Food" = food_words,
"Games" = games_words,
"Journalism" = journalism_words,
"Music" = music_words,
"Photography" = photography_words,
"Publishing" = publishing_words,
"Technology" = technology_words,
"Theater" = theater_words
)
df$SuccessfulName[i] <- if_else( any(str_detect(df$Name[i], cat_words)), 1, 0)
}
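For reference, the per-row loop and switch above could be replaced by a join: collapse each category’s top words into one regular expression, attach it to df by Category, and test the Name once per row. A sketch, assuming the tokens contain no regex metacharacters (word_patterns, df_flagged, and SuccessfulName_alt are hypothetical names; the result should match SuccessfulName):
word_patterns <- y %>%
  group_by(Category) %>%
  summarise(pattern = paste(word, collapse = "|"), .groups = "drop") # e.g. "comic|book|graphic"
df_flagged <- df %>%
  left_join(word_patterns, by = "Category") %>%
  mutate(SuccessfulName_alt = ifelse(!is.na(pattern) & str_detect(Name, pattern), 1, 0)) %>%
  dplyr::select(-pattern)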
There are many more samples where Country is “United States” than
other countries. This may affect representation in the data. Replace
this column with isUS, indicating whether the project
originated in the US or elsewhere.
df$isUS <- ifelse(df$Country=="United States", 1, 0)
df$State <- as.factor(df$State)
df <- df %>% dplyr::select(-c("Name", "Subcategory", "Country"))
# Count of each State
ggplot(data=df) +
geom_bar(mapping=aes(x=State)) +
theme_classic() +
ggtitle("Count of Each Project State")
df <- cbind(data.frame(Id = as.integer(rownames(df))), df)
df_melt = melt(subset(df, select = c(Id, Goal, Pledged, Backers, Campaign_Length)), id.vars = "Id")
ggplot(aes(value), data = df_melt) +
  geom_histogram(fill = "navyblue") +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Distributions of Continuous Variables", x = "Value", y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
corrplot::corrplot(
  cor(dplyr::select(df, c(Goal, Pledged, Backers, Campaign_Length)), use = "complete.obs"),
  method = 'circle', type = 'lower', insig = 'blank',
  addCoef.col = 'black', number.cex = 0.8, order = 'AOE', diag = FALSE
)
df <- df %>% dplyr::select(-c("Id"))
All predictors are used, with a 75-25 train-test split. The first tree is fit on the raw copy of the data (df1).
set.seed(1234)
df1$State <- as.factor(df1$State)
df1 <- df1[sample(1:nrow(df1)), ]
x <- df1
sample_set <- sample(nrow(x), round(nrow(x)*0.75), replace=FALSE)
df_train <- x[sample_set, ]
df_test <- x[-sample_set, ]
mod <- tree(State~., data=df_train)
## Warning in tree(State ~ ., data = df_train): NAs introduced by coercion
pred <- predict(mod, df_test, type="class")
## Warning in pred1.tree(object, tree.matrix(newdata)): NAs introduced by coercion
table(pred, df_test$State)
##
## pred Canceled Failed Successful Suspended
## Canceled 0 0 0 0
## Failed 956 5495 100 13
## Successful 53 348 5534 1
## Suspended 0 0 0 0
confusionMatrix(pred, df_test$State)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Canceled Failed Successful Suspended
## Canceled 0 0 0 0
## Failed 956 5495 100 13
## Successful 53 348 5534 1
## Suspended 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8823
## 95% CI : (0.8765, 0.8879)
## No Information Rate : 0.4674
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7823
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Canceled Class: Failed Class: Successful
## Sensitivity 0.00000 0.9404 0.9823
## Specificity 1.00000 0.8394 0.9415
## Pos Pred Value NaN 0.8371 0.9323
## Neg Pred Value 0.91928 0.9414 0.9848
## Prevalence 0.08072 0.4674 0.4507
## Detection Rate 0.00000 0.4396 0.4427
## Detection Prevalence 0.00000 0.5251 0.4749
## Balanced Accuracy 0.50000 0.8899 0.9619
## Class: Suspended
## Sensitivity 0.00000
## Specificity 1.00000
## Pos Pred Value NaN
## Neg Pred Value 0.99888
## Prevalence 0.00112
## Detection Rate 0.00000
## Detection Prevalence 0.00000
## Balanced Accuracy 0.50000
Now repeat the split and fit a tree on the engineered data (df).
df <- df[sample(1:nrow(df)), ]
x <- df
sample_set <- sample(nrow(x), round(nrow(x)*0.75), replace=FALSE)
df_train <- x[sample_set, ]
df_test <- x[-sample_set, ]
mod <- tree(State~., data=df_train)
## Warning in tree(State ~ ., data = df_train): NAs introduced by coercion
pred <- predict(mod, df_test, type="class")
## Warning in pred1.tree(object, tree.matrix(newdata)): NAs introduced by coercion
table(pred, df_test$State)
##
## pred Canceled Failed Successful Suspended
## Canceled 0 0 0 0
## Failed 920 5507 69 13
## Successful 49 308 5632 2
## Suspended 0 0 0 0
confusionMatrix(pred, df_test$State)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Canceled Failed Successful Suspended
## Canceled 0 0 0 0
## Failed 920 5507 69 13
## Successful 49 308 5632 2
## Suspended 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8911
## 95% CI : (0.8855, 0.8965)
## No Information Rate : 0.4652
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7981
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Canceled Class: Failed Class: Successful
## Sensitivity 0.00000 0.9470 0.9879
## Specificity 1.00000 0.8501 0.9472
## Pos Pred Value NaN 0.8461 0.9401
## Neg Pred Value 0.92248 0.9486 0.9894
## Prevalence 0.07752 0.4652 0.4561
## Detection Rate 0.00000 0.4406 0.4506
## Detection Prevalence 0.00000 0.5207 0.4793
## Balanced Accuracy 0.50000 0.8986 0.9675
## Class: Suspended
## Sensitivity 0.0000
## Specificity 1.0000
## Pos Pred Value NaN
## Neg Pred Value 0.9988
## Prevalence 0.0012
## Detection Rate 0.0000
## Detection Prevalence 0.0000
## Balanced Accuracy 0.5000
For both trees, overall accuracy is high, but the model never predicts the Canceled or Suspended states; it does well only on Successful and Failed. This is an effect of class imbalance.
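The imbalance is easy to quantify directly. A quick sketch (output not shown here) of the class proportions in the cleaned data:
# share of each State in the 50,000-row subset; Canceled and Suspended are rare
round(prop.table(table(df$State)), 3)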
plot(mod)
text(mod, pretty=0)
Next, train a random forest with the caret library.
rf_mod <- train(
State ~ .,
data = df_train,
metric = "Accuracy",
method = "rf",
trControl = trainControl(method = "boot632", number=3),
tuneGrid = expand.grid(.mtry = 3) # mtry=sqrt(num predictors)
)
rf_pred <- predict(rf_mod, df_test)
confusionMatrix(rf_pred, df_test$State)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Canceled Failed Successful Suspended
## Canceled 0 0 0 0
## Failed 940 5536 36 13
## Successful 29 279 5665 2
## Suspended 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8961
## 95% CI : (0.8906, 0.9014)
## No Information Rate : 0.4652
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8073
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Canceled Class: Failed Class: Successful
## Sensitivity 0.00000 0.9520 0.9937
## Specificity 1.00000 0.8521 0.9544
## Pos Pred Value NaN 0.8484 0.9481
## Neg Pred Value 0.92248 0.9533 0.9945
## Prevalence 0.07752 0.4652 0.4561
## Detection Rate 0.00000 0.4429 0.4532
## Detection Prevalence 0.00000 0.5220 0.4780
## Balanced Accuracy 0.50000 0.9020 0.9740
## Class: Suspended
## Sensitivity 0.0000
## Specificity 1.0000
## Pos Pred Value NaN
## Neg Pred Value 0.9988
## Prevalence 0.0012
## Detection Rate 0.0000
## Detection Prevalence 0.0000
## Balanced Accuracy 0.5000
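With classes this imbalanced, overall accuracy is dominated by the Failed and Successful classes. One summary that weights every class equally is macro-averaged recall, which can be computed from the per-class statistics confusionMatrix() already returns; a short sketch (cm is a hypothetical name):
cm <- confusionMatrix(rf_pred, df_test$State)
mean(cm$byClass[, "Sensitivity"], na.rm = TRUE) # average of the four per-class sensitivities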
Reset the seed and re-shuffle the data for a new train-test split.
set.seed(4444)
df <- df[sample(1:nrow(df)), ]
x <- df
sample_set <- sample(nrow(x), round(nrow(x)*0.75), replace=FALSE)
df_train <- x[sample_set, ]
df_test <- x[-sample_set, ]
rf_mod1 <- train(
State ~ .,
data = df_train,
metric = "Accuracy",
method = "rf",
trControl = trainControl(method = "boot632", number=3),
tuneGrid = expand.grid(.mtry = 3) # mtry=sqrt(num predictors)
)
rf_pred1 <- predict(rf_mod1, df_test)
confusionMatrix(rf_pred1, df_test$State)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Canceled Failed Successful Suspended
## Canceled 0 0 0 0
## Failed 932 5535 30 11
## Successful 36 247 5709 0
## Suspended 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8995
## 95% CI : (0.8941, 0.9047)
## No Information Rate : 0.4626
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8136
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Canceled Class: Failed Class: Successful
## Sensitivity 0.00000 0.9573 0.9948
## Specificity 1.00000 0.8552 0.9581
## Pos Pred Value NaN 0.8505 0.9528
## Neg Pred Value 0.92256 0.9588 0.9954
## Prevalence 0.07744 0.4626 0.4591
## Detection Rate 0.00000 0.4428 0.4567
## Detection Prevalence 0.00000 0.5206 0.4794
## Balanced Accuracy 0.50000 0.9062 0.9765
## Class: Suspended
## Sensitivity 0.00000
## Specificity 1.00000
## Pos Pred Value NaN
## Neg Pred Value 0.99912
## Prevalence 0.00088
## Detection Rate 0.00000
## Detection Prevalence 0.00000
## Balanced Accuracy 0.50000
The random forests and the first decision tree show negligible differences in accuracy. This is driven by the class imbalance, which ensemble methods alone do not fix. Addressing it would require either gathering more Canceled and Suspended examples or rebalancing the classes during training, for example by resampling or class weighting.
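For example, caret’s trainControl() accepts a sampling argument ("down", "up", "smote", or "rose") that rebalances the classes within each resample. A minimal, untested sketch of down-sampling with the same random forest setup (ctrl_down and rf_down are hypothetical names; the metric is switched to Kappa, which is less dominated by the majority classes than Accuracy):
ctrl_down <- trainControl(method = "boot632", number = 3, sampling = "down")
rf_down <- train(
  State ~ .,
  data = df_train,
  metric = "Kappa",
  method = "rf",
  trControl = ctrl_down,
  tuneGrid = expand.grid(.mtry = 3)
)
confusionMatrix(predict(rf_down, df_test), df_test$State)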