Kickstarter project-Part 2: Creating consistent data, Performing Exploratoy data analysis, and building the Decision tree

library(readr)
library(tidyverse)

## -- Attaching packages ----------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.0     v dplyr   0.8.4
## v tibble  2.1.3     v stringr 1.4.0
## v tidyr   1.0.2     v forcats 0.4.0
## v purrr   0.3.3

## Warning: package 'ggplot2' was built under R version 3.6.3

## -- Conflicts -------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(DataExplorer)
library(Hmisc)

## Warning: package 'Hmisc' was built under R version 3.6.3

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(corrplot)

## Warning: package 'corrplot' was built under R version 3.6.3

## corrplot 0.84 loaded

library(ggplot2)

options(tibble.width = Inf)
options(tibble.print_max = 10440, tibble.print_min = 6)

getwd()

## [1] "C:/Users/user/Desktop/Data analysis projects/Kickstarter/code"

KS <- read_csv("Kickstarter2.csv")

## Parsed with column specification:
## cols(
##   id = col_double(),
##   CategoryID = col_double(),
##   Category = col_character(),
##   SubcategoryId = col_double(),
##   Subcategory = col_character(),
##   goal = col_double(),
##   pledged = col_double(),
##   backers = col_double(),
##   launched = col_datetime(format = ""),
##   deadline = col_datetime(format = ""),
##   days = col_double(),
##   outcome = col_character()
## )

str(KS)

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 14257 obs. of  12 variables:
##  $ id           : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ CategoryID   : num  2 2 2 9 2 11 10 2 2 6 ...
##  $ Category     : chr  "Film & Video" "Film & Video" "Film & Video" "Fashion" ...
##  $ SubcategoryId: num  23 47 24 52 9 27 20 23 54 12 ...
##  $ Subcategory  : chr  "Webseries" "Shorts" "Animation" "Apparel" ...
##  $ goal         : num  15000 224 5000 6000 2000000 ...
##  $ pledged      : num  20 414 1497 8795 2 ...
##  $ backers      : num  3 23 28 218 2 5 45 4 0 11 ...
##  $ launched     : POSIXct, format: "2013-04-25" "2015-11-16" ...
##  $ deadline     : POSIXct, format: "2013-04-25" "2015-11-16" ...
##  $ days         : num  30 30 30 30 35 37 45 30 30 30 ...
##  $ outcome      : chr  "failed" "successful" "failed" "successful" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   CategoryID = col_double(),
##   ..   Category = col_character(),
##   ..   SubcategoryId = col_double(),
##   ..   Subcategory = col_character(),
##   ..   goal = col_double(),
##   ..   pledged = col_double(),
##   ..   backers = col_double(),
##   ..   launched = col_datetime(format = ""),
##   ..   deadline = col_datetime(format = ""),
##   ..   days = col_double(),
##   ..   outcome = col_character()
##   .. )

Checking the inconsistency of the data

Convert Category, Subcategory and outcome into factor

KS <- KS %>%
    mutate_if(is.character,as.factor)

class(KS$outcome)

## [1] "factor"

Create columns of year, month, and day of month. Convert them into numeric

KS$launchedMonth <- as.numeric(format(KS$launched, "%m"))
KS$launchedDay <- as.numeric(format(KS$launched, "%d"))

KS$deadlineMonth <- as.numeric(format(KS$deadline, "%m"))
KS$deadlineDay <- as.numeric(format(KS$deadline, "%d"))

Rearrange columns using tidyverse (This step can be done with SQL)

KS <- KS %>% 
  dplyr::select(id,
    Category, 
    Subcategory,
    launchedMonth, launchedDay,
    deadlineMonth, deadlineDay,
    days,
    goal, backers, pledged,
    outcome
    )

KS %>% 
  group_by(launchedMonth) %>% 
  count()

## # A tibble: 12 x 2
## # Groups:   launchedMonth [12]
##    launchedMonth     n
##            <dbl> <int>
##  1             1  1056
##  2             2  1095
##  3             3  1354
##  4             4  1249
##  5             5  1298
##  6             6  1240
##  7             7  1346
##  8             8  1214
##  9             9  1145
## 10            10  1255
## 11            11  1229
## 12            12   776

Recoding

KS <- KS %>% 
  mutate(launchedmonth = case_when(
    launchedMonth == "1" ~ "Jan",
    launchedMonth == "2" ~ "Fev",
    launchedMonth == "3" ~ "Mar",
    launchedMonth == "4" ~ "Apr",
    launchedMonth == "5" ~ "May",
    launchedMonth == "6" ~ "June",                
    launchedMonth == "7" ~ "July",
    launchedMonth == "8" ~ "Aug",
    launchedMonth == "9" ~ "Sept",
    launchedMonth == "10" ~ "Oct",
    launchedMonth == "11" ~ "Nov",
    launchedMonth == "12" ~ "Dec"))

KS$launchedmonth <- factor(KS$launchedmonth,
                                levels = c("Jan","Fev","Mar",
                                           "Apr","May","June",
                                           "July","Aug", "Sept",
                                           "Oct","Nov","Dec"),ordered = TRUE)

KS <- subset(KS, select = -c(launchedMonth))

KS %>% 
  group_by(launchedmonth) %>% 
  count()

## # A tibble: 12 x 2
## # Groups:   launchedmonth [12]
##    launchedmonth     n
##    <ord>         <int>
##  1 Jan            1056
##  2 Fev            1095
##  3 Mar            1354
##  4 Apr            1249
##  5 May            1298
##  6 June           1240
##  7 July           1346
##  8 Aug            1214
##  9 Sept           1145
## 10 Oct            1255
## 11 Nov            1229
## 12 Dec             776

Projects on Kickstarter can last anywhere from 1 - 60 days.

KS %>% 
  filter(days > 60) %>% 
  count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1   234

Only keep records where projects can last anywhere from 1 to 60 days

KS <- KS %>% 
  filter(days <= 60)

KS %>% 
  filter(days == 0) %>% 
  count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1     0

There is no project that launched and ended within a day.

Missing values

library(DataExplorer)

plot_missing(KS)

The data does not contain missing values

Checking consistency between columns. Errors could be

The “successful” projects do not have any supporters (i.e. bakers = 0).

KS %>% 
  filter(backers == 0 & outcome == "successful") %>% 
  summarise( count = n())

## # A tibble: 1 x 1
##   count
##   <int>
## 1     7

There are seven records that contain error. What are these rows?

KS %>% 
  select(id, Category, backers, outcome) %>% 
  filter(backers == 0 & outcome == "successful")

## # A tibble: 7 x 4
##      id Category     backers outcome   
##   <dbl> <fct>          <dbl> <fct>     
## 1   589 Music              0 successful
## 2  3851 Music              0 successful
## 3  4122 Music              0 successful
## 4  4200 Film & Video       0 successful
## 5  6738 Music              0 successful
## 6  9516 Music              0 successful
## 7 14872 Film & Video       0 successful

To remove these seven records

row_toremove <- with(KS, which(backers == 0 & outcome == "successful", arr.ind=TRUE))
KS <- KS[-row_toremove,]

A project is failed when goal > pledged. There will be errors if b1. goal < pleged and the outcome = failed (outcome should be successful)

KS %>% 
  select(id, Category, backers, pledged, goal, outcome) %>% 
  filter(pledged > goal & outcome == "failed")

## # A tibble: 1 x 6
##      id Category   backers pledged  goal outcome
##   <dbl> <fct>        <dbl>   <dbl> <dbl> <fct>  
## 1 11467 Publishing      66   10797 10000 failed

KS$outcome[KS$pledged >  KS$goal & KS$outcome == "failed"] <- "successful"

b2. goal > pleded and the outcome = successful (the cutcome should be failed)

KS %>% 
  select(id, Category, backers, pledged, goal, outcome) %>% 
  filter(pledged < goal & outcome == "successful")

## # A tibble: 0 x 6
## # ... with 6 variables: id <dbl>, Category <fct>, backers <dbl>, pledged <dbl>,
## #   goal <dbl>, outcome <fct>

b3. goal > plegded and the outcome = canceled, because the outcome should be labelled as failed. It may not be an error if the founder decided to cancel the project. However, for this project, we will change the outcome of these records from canceled to succefull

KS %>% 
  filter(pledged < goal & outcome == "canceled") %>% 
  count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1  1053

There are 1073 rows.

KS %>% 
  select(id, Category, backers, pledged, goal, outcome) %>% 
  filter(pledged < goal & outcome == "canceled") %>% 
  head(10)

## # A tibble: 10 x 6
##       id Category    backers pledged  goal outcome 
##    <dbl> <fct>         <dbl>   <dbl> <dbl> <fct>   
##  1    38 Technology        0       0 25000 canceled
##  2    50 Design           19     912  5000 canceled
##  3    63 Publishing       30    4373 50000 canceled
##  4    98 Music            16    1979 18000 canceled
##  5    99 Games            24    5816 50000 canceled
##  6   101 Publishing        1       9  1500 canceled
##  7   133 Fashion           0       0  8000 canceled
##  8   135 Art               0       0  4303 canceled
##  9   169 Technology        0       0 20000 canceled
## 10   177 Photography       0       0 15000 canceled

These projects are canceled because pledged money is way less than goal. These projects are propably failed. We will apply “Conditionally replace multiple rows in a data frame”

KS$outcome[KS$pledged <  KS$goal & KS$outcome == "canceled"] <- "failed"

b4. plegded > goal and the outcome = canceled, because the outcome should be labelled as successful. Similarly, we can change the outcomes of these records from canceled to faile

KS %>% 
  filter(pledged > goal & outcome == "canceled") %>% 
  count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1    13

KS %>% 
  filter(pledged > goal & outcome == "canceled") %>% 
  head(10)

## # A tibble: 10 x 12
##       id Category   Subcategory    launchedDay deadlineMonth deadlineDay  days
##    <dbl> <fct>      <fct>                <dbl>         <dbl>       <dbl> <dbl>
##  1   797 Food       Drinks                  16             9          16    30
##  2  2871 Games      Tabletop Games           2             3           2    29
##  3  4761 Games      Video Games             12            10          12    30
##  4  6489 Technology Flight                  18             2          18    30
##  5  6530 Design     Architecture            16            12          16    45
##  6  7596 Technology Technology              28             1          28    46
##  7  8858 Food       Food                     9             7           9    30
##  8 10121 Fashion    Footwear                15            10          15    28
##  9 10796 Design     Product Design          22            11          22    31
## 10 11701 Technology Gadgets                 24             4          24    36
##      goal backers pledged outcome  launchedmonth
##     <dbl>   <dbl>   <dbl> <fct>    <ord>        
##  1   1000      45   1951  canceled Sept         
##  2  35000     305  37954  canceled Mar          
##  3  10000     163  10275. canceled Oct          
##  4 100000     243 257771  canceled Fev          
##  5     80       2     85  canceled Dec          
##  6 100000     575 137377  canceled Jan          
##  7      8       8    237  canceled July         
##  8   2000      68   2249  canceled Oct          
##  9   3000     228  13599  canceled Nov          
## 10  60000     613  96583  canceled Apr

The outcome of these records will be rewritten as successful.

KS$outcome[KS$pledged >  KS$goal & KS$outcome == "canceled"] <- "successful"

How many projects that were canceled?

KS %>% 
  group_by(outcome) %>% 
  count()

## # A tibble: 3 x 2
## # Groups:   outcome [3]
##   outcome        n
##   <fct>      <int>
## 1 canceled       1
## 2 failed      8784
## 3 successful  5231

there is only one project that was canceled

KS %>% 
  select(id, Category, backers, pledged, goal, outcome) %>% 
  filter(outcome == "canceled")

## # A tibble: 1 x 6
##      id Category   backers pledged  goal outcome 
##   <dbl> <fct>        <dbl>   <dbl> <dbl> <fct>   
## 1  2540 Journalism       1       1     1 canceled

We can remove that record. A amount of 1 dollar as a goal seem to be not reasonable

KS <- KS %>% 
  filter(id != 2540)

How about goal ? are there any projects that put 0 dollar as goal?

summary(KS$goal)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         1      2000      5296     61730     15293 100000000

How many project that the goal are less than 100 dollars

KS %>% 
  filter(goal < 100) %>% 
  count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1   166

Were these projects successful?

KS %>% 
  group_by(outcome) %>% 
  filter(goal < 100) %>% 
  count()

## # A tibble: 2 x 2
## # Groups:   outcome [2]
##   outcome        n
##   <fct>      <int>
## 1 failed        75
## 2 successful    91

So far, we will keep these records.

str(KS)

## Classes 'tbl_df', 'tbl' and 'data.frame':    14015 obs. of  12 variables:
##  $ id           : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Category     : Factor w/ 15 levels "Art","Comics",..: 7 7 7 6 7 1 15 7 7 3 ...
##  $ Subcategory  : Factor w/ 157 levels "3D Printing",..: 152 128 6 8 39 70 140 152 24 32 ...
##  $ launchedDay  : num  25 16 28 22 24 25 1 21 3 14 ...
##  $ deadlineMonth: num  4 11 8 7 8 9 7 5 3 4 ...
##  $ deadlineDay  : num  25 16 28 22 24 25 1 21 3 14 ...
##  $ days         : num  30 30 30 30 35 37 45 30 30 30 ...
##  $ goal         : num  15000 224 5000 6000 2000000 ...
##  $ backers      : num  3 23 28 218 2 5 45 4 0 11 ...
##  $ pledged      : num  20 414 1497 8795 2 ...
##  $ outcome      : Factor w/ 3 levels "canceled","failed",..: 2 3 2 3 2 2 2 2 2 2 ...
##  $ launchedmonth: Ord.factor w/ 12 levels "Jan"<"Fev"<"Mar"<..: 4 11 8 7 8 9 7 5 3 4 ...

table(KS$outcome)

## 
##   canceled     failed successful 
##          0       8784       5231

The outcome still has 3 factor levels. We will remove the unused level (“canceled”)

KS$outcome <- factor(KS$outcome)

table(KS$outcome)

## 
##     failed successful 
##       8784       5231

Exploratory data analysis

theme_general <- theme(plot.title = element_text(hjust = 0.5), 
        axis.title = element_text(size = 14, face = "bold", color = "black"), 
        axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        legend.position = "right", 
        legend.title = element_text(size = 14),
        legend.text = element_text(colour="black", size = 14))

Q1. What categories are most popular? We first begin by examining the number of projects by category

Category_df <- KS %>%
  group_by(Category) %>% 
  summarise(counts = n()) %>% 
  mutate(Percentage = round(counts*100/sum(counts),2)) %>% 
  arrange(desc(counts))

ggplot(data = Category_df,
           aes(x= reorder(Category, -counts),
               y  = counts,
               fill = Percentage)) +
  geom_bar(stat = "identity",
           width = 0.7) +
  scale_fill_gradient(low="skyblue1", high="royalblue4")+
  geom_text(aes(label = paste0(counts,"\n", Percentage,"%")), 
            vjust = -0.1, 
            color = "darkblue", 
            size = 5) +
  scale_y_continuous(limits = c(0,2500)) +
  theme_minimal() +
  labs(x = "Category",
       y = "Frequency",
       caption = "Kickstarter 2009-2014") +
  theme_general

Film & Video, Music, and Publishing lead the chart. Journalism, Dance and Crafts are the least popular.

Preliminary analysis using SQL revealed that there are 153 subcatergories.

What are the top ten subcategories ?

subcat.freq <- KS %>%
  dplyr::group_by(Subcategory) %>%
  dplyr::summarize(Frequency = n()) %>%
  arrange(desc(Frequency)) %>% 
  head(10)

ggplot(subcat.freq, 
       aes(x = reorder(Subcategory,-Frequency), Frequency, fill = Frequency)) + 
  geom_bar(stat="identity",
           width = 0.65,
           fill = "skyblue1",
           colour = "skyblue1") + 
  labs(title = "Number of Projects by subcategory", 
       x = "The top 10 Project Subcategory",
       y = "Frequency") +
  geom_text(aes(label = Frequency), 
            vjust = 0.5, 
            hjust = -0.1,
            color = "darkblue", 
            size = 5) + 
  coord_flip() +
  theme_general

How to find which Category these subcategories belong?

KS %>% 
  select(Category, Subcategory) %>% 
  group_by(Category, Subcategory) %>% 
  filter(Subcategory %in% c("Product Design", "Documentary", "Music", "Tabletop Games",
                            "Shorts", "Video Games", "Food", "Film & Video", "Nonfiction", "Fiction")) %>% 
  summarise(fre =  n()) %>% 
  arrange(desc(fre))

## # A tibble: 10 x 3
## # Groups:   Category [6]
##    Category     Subcategory      fre
##    <fct>        <fct>          <int>
##  1 Design       Product Design   876
##  2 Film & Video Documentary      654
##  3 Games        Tabletop Games   521
##  4 Music        Music            508
##  5 Film & Video Shorts           477
##  6 Games        Video Games      421
##  7 Food         Food             406
##  8 Film & Video Film & Video     356
##  9 Publishing   Fiction          356
## 10 Publishing   Nonfiction       356

These top sub-categories stemm from category: Design, Film & Video, Music, Games, Food and Publishing.

Question 2: What types of projects are being funded? This question is akin to the first question but phrased from the perspective of the backers. In other words, the most funded projects are the most popular projects in the perspective of the backers

KS %>% 
  select(Category, Subcategory, pledged) %>% 
  arrange(desc(pledged)) %>% 
  head(10)

## # A tibble: 10 x 3
##    Category     Subcategory        pledged
##    <fct>        <fct>                <dbl>
##  1 Technology   Web               5408917.
##  2 Games        Tabletop Games    3999796.
##  3 Film & Video Narrative Film    3105473.
##  4 Games        Video Games       2278255.
##  5 Art          Sculpture         1924018 
##  6 Design       Product Design    1842142.
##  7 Technology   Hardware          1652247.
##  8 Games        Tabletop Games    1546270.
##  9 Games        Tabletop Games    1341305.
## 10 Technology   Space Exploration 1241615.

A lot of the projects here fall under the “Games” and “Technology” category.

Similarly, let’s list the top 15 most backed projects (i.e. projects with the most backers).

KS %>% 
  select(Category, Subcategory, backers) %>% 
  arrange(desc(backers)) %>% 
  head(20)

## # A tibble: 20 x 3
##    Category     Subcategory       backers
##    <fct>        <fct>               <dbl>
##  1 Technology   Web                105857
##  2 Film & Video Narrative Film      46520
##  3 Games        Tabletop Games      40642
##  4 Technology   Space Exploration   23331
##  5 Design       Product Design      20680
##  6 Games        Video Games         17765
##  7 Technology   Gadgets             17218
##  8 Games        Video Games         13987
##  9 Games        Tabletop Games      13765
## 10 Design       Graphic Design      11334
## 11 Games        Video Games          9869
## 12 Technology   Hardware             9805
## 13 Film & Video Animation            9122
## 14 Film & Video Science Fiction      8548
## 15 Games        Tabletop Games       8396
## 16 Technology   Hardware             8359
## 17 Technology   Hardware             8019
## 18 Technology   Hardware             8018
## 19 Technology   Hardware             7766
## 20 Games        Video Games          7564

8 technology 7 games 3 film & video

Question 3: What types of projects funding is going towards ? ===> aggregating the amount of funds pledged for each category, providing us with the total amount pledged for each category.

pledged.tot_df <- KS %>%
  dplyr::group_by(Category) %>%
  dplyr::summarize(total = sum(pledged/1000000)) %>%
  mutate(Percentage = round(total*100/sum(total),2)) %>% 
  arrange(desc(total))

ggplot(data = pledged.tot_df,
           aes(x= reorder(Category, -total),
               y  = total,
               fill = Percentage)) +
  geom_bar(stat = "identity",
           width = 0.7) +
  scale_fill_gradient(low="skyblue1", high="royalblue4")+
  geom_text(aes(label = paste0("$",round(total,1),"M", "\n",Percentage,"%")), 
            vjust = -0.1, 
            color = "darkblue", 
            size = 5) +
  scale_y_continuous(limits = c(0,30)) +
  theme_minimal() +
  labs(x = "Category",
       y = "Pledges (Million USD)",
       caption = "Kickstarter 2009-2014") +
  ggtitle("Total Amount Pledged by Category") +
  theme_general

Technology, Games, and Design are highest grossing categories by far.

Film& Video is the most popular among category. Less than 1 % of projects is Dance or Journalism. For each categpry, which catergory has higher successful rate?

library(arsenal)

## Warning: package 'arsenal' was built under R version 3.6.3

## 
## Attaching package: 'arsenal'

## The following object is masked from 'package:Hmisc':
## 
##     %nin%

table_one <- tableby(outcome ~ Category + goal+ pledged + backers + days , data = KS)
summary(table_one, title = "Summary statistic of Kickstarter data")

## 
## Table: Summary statistic of Kickstarter data
## 
## |                               |     failed (N=8784)     |  successful (N=5231)   |     Total (N=14015)     | p value|
## |:------------------------------|:-----------------------:|:----------------------:|:-----------------------:|-------:|
## |**Category**                   |                         |                        |                         | < 0.001|
## |&nbsp;&nbsp;&nbsp;Art          |       621 (7.1%)        |       425 (8.1%)       |       1046 (7.5%)       |        |
## |&nbsp;&nbsp;&nbsp;Comics       |       171 (1.9%)        |       243 (4.6%)       |       414 (3.0%)        |        |
## |&nbsp;&nbsp;&nbsp;Crafts       |       246 (2.8%)        |       90 (1.7%)        |       336 (2.4%)        |        |
## |&nbsp;&nbsp;&nbsp;Dance        |        41 (0.5%)        |       81 (1.5%)        |       122 (0.9%)        |        |
## |&nbsp;&nbsp;&nbsp;Design       |       730 (8.3%)        |       426 (8.1%)       |       1156 (8.2%)       |        |
## |&nbsp;&nbsp;&nbsp;Fashion      |       604 (6.9%)        |       218 (4.2%)       |       822 (5.9%)        |        |
## |&nbsp;&nbsp;&nbsp;Film & Video |      1492 (17.0%)       |      909 (17.4%)       |      2401 (17.1%)       |        |
## |&nbsp;&nbsp;&nbsp;Food         |       675 (7.7%)        |       253 (4.8%)       |       928 (6.6%)        |        |
## |&nbsp;&nbsp;&nbsp;Games        |       773 (8.8%)        |       463 (8.9%)       |       1236 (8.8%)       |        |
## |&nbsp;&nbsp;&nbsp;Journalism   |        82 (0.9%)        |       28 (0.5%)        |       110 (0.8%)        |        |
## |&nbsp;&nbsp;&nbsp;Music        |       917 (10.4%)       |      932 (17.8%)       |      1849 (13.2%)       |        |
## |&nbsp;&nbsp;&nbsp;Photography  |       260 (3.0%)        |       141 (2.7%)       |       401 (2.9%)        |        |
## |&nbsp;&nbsp;&nbsp;Publishing   |      1026 (11.7%)       |       507 (9.7%)       |      1533 (10.9%)       |        |
## |&nbsp;&nbsp;&nbsp;Technology   |       960 (10.9%)       |       251 (4.8%)       |       1211 (8.6%)       |        |
## |&nbsp;&nbsp;&nbsp;Theater      |       186 (2.1%)        |       264 (5.0%)       |       450 (3.2%)        |        |
## |**goal**                       |                         |                        |                         |   0.007|
## |&nbsp;&nbsp;&nbsp;Mean (SD)    | 92612.452 (2211566.793) |  9870.564 (36745.974)  | 61729.625 (1751416.964) |        |
## |&nbsp;&nbsp;&nbsp;Range        |  1.000 - 100000000.000  |  0.750 - 2000000.000   |  0.750 - 100000000.000  |        |
## |**pledged**                    |                         |                        |                         | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD)    |   1472.989 (7024.771)   | 22676.391 (131737.279) |  9387.009 (81319.461)   |        |
## |&nbsp;&nbsp;&nbsp;Range        |   0.000 - 260372.610    |  4.640 - 5408916.950   |   0.000 - 5408916.950   |        |
## |**backers**                    |                         |                        |                         | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD)    |     18.065 (89.154)     |   285.578 (1913.957)   |   117.912 (1178.488)    |        |
## |&nbsp;&nbsp;&nbsp;Range        |    0.000 - 4201.000     |   1.000 - 105857.000   |   0.000 - 105857.000    |        |
## |**days**                       |                         |                        |                         | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD)    |     34.576 (11.941)     |    31.615 (10.422)     |     33.471 (11.487)     |        |
## |&nbsp;&nbsp;&nbsp;Range        |     1.000 - 60.000      |     1.000 - 60.000     |     1.000 - 60.000      |        |

It is important to take the number of backers into account as well ==> determine the average amount pledged per backer for each category. ==> calculate this by taking the total amount pledged for each category and dividing it by the total number of backers for each category.

library(pander)

pledged.avg <- KS %>%
  dplyr::group_by(Category) %>%
  dplyr::summarize(pledged = sum(pledged), backers = sum(backers)) %>%
  mutate(avg = pledged/backers) %>%
  arrange(desc(avg)) 

pledged.avg$Category <- factor(pledged.avg$Category, levels = pledged.avg$Category)

pledged.avg %>% 
  pander()

Category	pledged	backers	avg
Art	4907098	43591	112.6
Food	4619730	45354	101.9
Fashion	5412036	53726	100.7
Design	23300952	256999	90.67
Film & Video	18921532	217493	87
Photography	1515172	17513	86.52
Technology	27749278	327101	84.83
Dance	483046	5947	81.23
Theater	1474214	18359	80.3
Journalism	439491	6021	72.99
Games	27480040	406795	67.55
Music	6850120	103560	66.15
Publishing	5149074	84421	60.99
Crafts	571697	10361	55.18
Comics	2685461	55302	48.56

library(hrbrthemes)

## Warning: package 'hrbrthemes' was built under R version 3.6.3

## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.

##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and

##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow

ggplot(pledged.avg, aes(Category, avg, fill = avg)) + 
  geom_bar(stat = "identity") + 
  labs(title = "Average Amount Pledged per Backer",
       x = "Project Category",
       y = "Amount Pledged (USD)") +
  geom_text(aes(label=paste0("$", round(avg,2))), 
            vjust= 0.5,
            hjust = -0.1) + 
  scale_fill_distiller(palette = "Spectral") +
  coord_flip() +
  theme_general

“Art” has the highest average amount pledged, whereas Comics has the lowest average amount pledged. An interesting note here is that the average amount pledged for “Art” is nearly double that of “Comics”, even though Games had the higher aggregate amount pledged as shown in the previous graph.

Question 4: Distribution of pledged for each category

ggplot(KS, aes(Category, pledged, fill = Category)) + 
  geom_boxplot() + 
  ggtitle("Amount Pledged vs. Project Category") + 
  xlab("Project Category") + 
  ylab("Amount Pledged (USD)") + 
  theme(plot.title = element_text(size = 15, face = "bold", hjust = 0.5), 
        axis.title = element_text(size = 12, face = "bold"), 
        axis.text.x=element_text(size = 12, angle = 90), legend.position = "null") + 
  coord_cartesian(ylim=c(0,30000))

There are no doubt a lot of projects that received little to no funding as well as huge outliers.

KS %>% 
  group_by(Category) %>% 
  summarise(median = median(pledged)) %>% 
  arrange(desc(median))

## # A tibble: 15 x 2
##    Category     median
##    <fct>         <dbl>
##  1 Dance        2328  
##  2 Design       2091. 
##  3 Comics       1720. 
##  4 Theater      1403. 
##  5 Games        1378. 
##  6 Music        1085  
##  7 Film & Video  757. 
##  8 Art           460. 
##  9 Photography   399. 
## 10 Publishing    320  
## 11 Food          307. 
## 12 Technology    302  
## 13 Fashion       279. 
## 14 Journalism    141. 
## 15 Crafts         90.0

Designe, Game and Comics have higher median than other category.

Many projects, smnaller goals = lower median. Higher median == less project, less backer, but the pledged is dispersed.

Having looked at both the amount pledged and goal for different kinds of projects, see how the distribution of each compares to one another. Since we can expect both distributions to be heavily right-skewed due to many projects that received little to no funding and extremely high outliers, we will use a log transformation on both variables to better visualize their distributions.

outcome.pct <- KS %>%
  dplyr::group_by(Category, outcome) %>%
  dplyr::summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  arrange(desc(outcome), pct)

outcome.pct$Category <- factor(outcome.pct$Category,levels=outcome.pct$Category[1:(nrow(outcome.pct)/2)])

plot1 <- ggplot(outcome.pct, aes(reorder(Category,-count), pct, fill = outcome)) + 
  geom_bar(stat="identity") + 
  ggtitle("Success vs. Failure Rate by Project Category") + 
  xlab("Project Category") + 
  ylab("Percentage") + 
  scale_y_continuous(labels=scales::percent) + 
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  geom_text(aes(label = paste0(round(pct*100,1),"%")), 
            position = position_stack(vjust=0.5), 
            colour = "blue", 
            size = 3.5, 
            fontface = "bold") + 
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title = element_text(size = 12), 
        axis.text.x = element_text(size = 12), 
        axis.text.y = element_text(size = 12),
        legend.position = "bottom", 
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(colour="black", size = 12)) + 
  coord_flip()

plot2 <- ggplot(outcome.pct, aes(reorder(Category,-count),  count, fill = outcome)) + 
  geom_bar(stat="identity") + 
  ggtitle("Success vs. Failure Rate by Project Category") + 
  xlab("Project Category") + ylab("Number of Projects") + 
  scale_y_continuous(labels = scales::comma) + 
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  geom_text(aes(label = paste0(round(count,1),"")), 
            position = position_stack(vjust = 0.5), 
            colour = "blue", 
            size = 3, 
            fontface = "bold") + 
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title = element_text(size = 12), 
        axis.text.x = element_text(size = 12), 
        axis.text.y = element_text(size = 12), 
        legend.position = "bottom", 
        legend.title = element_text(size  = 12),
        legend.text = element_text(colour="black", size = 12)) + 
  coord_flip()
gridExtra::grid.arrange(plot1, plot2, ncol = 2)

Dance, Comics, and Theater have the highest success rates and Technology, Fashion, Journalism, Crafts, and Foods have the lowest. This agrees with the box plots for amounts pledged and project goal amounts above as Dance and Comics both had high median amounts pledged and low median goals, with Theater having a low median goal as well. Technology, Journalism, and Crafts had low median amounts pledged, with Technology having a high median goal. In general, the higher the amount pledged and/or the lower the goal, the more likely a project will be successful. Interestingly enough, Crafts, having a low median amount pledged, also has a low success rate despite having a low median goal, which may indicate that people generally are not as interested in this category as a whole.

Question 5 : What types of projects were successful and unsuccessful? reak down the number of projects by their status (e.g. successful, failed, cancelled, etc.).

outcome_prop <- KS %>% 
  group_by(outcome) %>% 
  count() %>% 
  ungroup()%>% 
  arrange(desc(outcome)) %>%
  mutate(percentage = round(n/sum(n),4)*100,
         lab.pos = cumsum(percentage)-0.5*percentage)

ggplot(data = outcome_prop, 
       aes(x = "", 
           y = percentage, 
           fill = outcome))+
  geom_bar(stat = "identity")+
  coord_polar("y") +
  geom_text(aes(y = lab.pos, 
                label = paste(percentage,"%", sep = "")), col = "blue", size = 5) +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  theme_void() +
  theme(legend.title = element_text(color = "black", size = 14),
        legend.text = element_text(color = "black", size = 14))

About two thirds of projects see the light of day.

Question 6: Month of year matters? 1. Number of projects

df_month <- KS %>% 
  group_by(launchedmonth) %>% 
  summarise(Count = n())

ggplot(df_month, aes(launchedmonth, Count)) + 
  geom_point(aes(size = Count, group = Count), colour = "blue") +
  labs(title = "Number of projects by month",
       x = "Month",
       y = "Frequency") + 
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title = element_text(size = 14, face = "bold"), 
        axis.text.x = element_text(size = 14), 
        axis.text.y = element_text(size = 14), 
        legend.position = "right", 
        legend.title = element_text(size= 14, face = "bold"))

Less projects launched in January and February. Much less projects launched in December

Successful vis failed

month.pct <- KS %>%
  dplyr::group_by(launchedmonth, outcome) %>%
  dplyr::summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  arrange(desc(outcome), pct)

ggplot(month.pct, aes(launchedmonth, 
                      pct,
                      fill = outcome)) + 
  geom_bar(stat="identity") + 
  ggtitle("Success vs. Failure Rate by month") + 
  xlab("Month") + 
  ylab("Percentage") + 
  scale_y_continuous(labels = scales::percent) + 
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  geom_text(aes(label = paste0(round(pct*100,1),"%")), 
            position = position_stack(vjust = 0.5), 
            colour = "blue", 
            size = 3.5, 
            fontface = "bold") + 
  theme_general

The rate of success is lowest in July (32.8%) and highest in March and November (about 40%)

Chi-square test for association

H0: Month is not associated with the outcome Ha: Month is associated with the outcome

chisq.test(KS$launchedmonth, KS$outcome)

## 
##  Pearson's Chi-squared test
## 
## data:  KS$launchedmonth and KS$outcome
## X-squared = 28.469, df = 11, p-value = 0.002742

The p-value = 0.003 < 0.01. We reject the null hypothesis. We conclude that month is associated with the outcome

Day of month matters?

launchedDay_tbl <-  KS %>% 
  group_by(launchedDay) %>% 
  count()

ggplot(launchedDay_tbl, aes(x = launchedDay, y = n)) + 
  geom_line(color = "grey", size = 1.5) + 
  geom_point(color = "blue", size = 4)+
  geom_smooth(colour="green", method = "lm", se = FALSE) +
  scale_x_continuous(breaks = seq(from = 0, to = 31, by = 5)) +
      labs(title = "Number of projects by day of month", 
           x = "Day of month",
           y = "Frequency") + 
  theme_set(theme_classic()) + 
  theme_general

## `geom_smooth()` using formula 'y ~ x'

The number of launched projects is highest on the first day of month and decreases toward the end of month. What are the successful rate in function of day

day.pct <- KS %>%
  dplyr::group_by(launchedDay, outcome) %>%
  dplyr::summarize(count = n()) %>%
  mutate(percentage = count/sum(count)) %>%
  arrange(launchedDay, percentage)


ggplot(day.pct, aes(launchedDay, 
                    percentage,
                    fill = outcome)) + 
  geom_bar(stat="identity") + 
  ggtitle("Success vs. Failure Rate by day of month") + 
  xlab("Day of month") + 
  ylab("Percentage") + 
  scale_y_continuous(labels = scales::percent) + 
  scale_x_continuous(breaks = c(1:31))+
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  geom_text(aes(label = paste0(round(percentage*100,1),"%")), 
            position = position_stack(vjust = 0.5), 
            colour = "blue", 
            size = 3.5, 
            fontface = "bold") + 
  theme_general

The lowest successful rate is 33.5 and the highest successful rate is 43.4% There is no pattern that can be observed here.

Outcome and duration of project

 # Overlaid histograms
ggplot(KS, aes(x=days, color=outcome)) +
  geom_histogram(fill="white", alpha=0.5, position="identity") +
  scale_color_manual(values=c("red", "blue"))+
  scale_fill_manual(values=c("red", "blue"))+
  theme_general

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(KS, aes(x = factor(outcome), 
               y = days,
               fill = factor(outcome))) +
  geom_boxplot() + 
  geom_jitter(width = 0.35, 
              alpha = 0.2,
              shape = 15,
              color = "steelblue") +
  scale_y_continuous (breaks = seq(0, 70, 15),
                      limits = c(0, 70)) +
  ggtitle("Distribution of days by outcome") +
  xlab("Outcome") +
  ylab("Duration of projects (days)") +
  labs(fill = "Outcome") +
  scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
  theme(axis.line = element_line(size = 0.5, colour = "grey"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        plot.title = element_text(size = 14, face = "bold"),
        text = element_text(colour = "black", size = 14),
        axis.text.x = element_text(colour = "black", size = 14),
        axis.text.y = element_text(colour = "black", size = 14),
        legend.position = "right")

There are many projects that were launched and ended for a duration of 30, 45, and 60 days.

Correlation analysis

library(ggpubr)

## Loading required package: magrittr

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 3.6.3

#Correlation backers, pledge, goal and days
corr <- round(cor(KS[c(7:10)]),2)
ggcorrplot(corr, 
           type = "upper", 
           lab = TRUE,
           outline.col = "white",
           ggtheme = ggplot2::theme_gray,
           colors = c("#6D9EC1", "white", "#E46726"))

There is a strong and positive correlatin between pledged and backers.

ggscatter(KS, x = "backers", y = "pledged", 
          add = "reg.line", 
          conf.int = TRUE, 
          cor.coef = TRUE,
          cor.method = "pearson",
          xlab = "Number of backers", 
          ylab = "Pledged amount (USD))") + theme_general

## `geom_smooth()` using formula 'y ~ x'

df_goal <- KS %>%
  filter(goal!="0")

ggplot(df_goal,
       aes(log(backers),log(pledged), 
           color = outcome)) + 
  geom_point(position = "identity") +  
  ggtitle("Backers in function of pledged amount") + xlab("log(Backers)") + 
  ylab("Pledged") + 
  theme_general

Goals in function of usd pledged

ggplot(df_goal, aes(log(goal),log(pledged), color = outcome)) + 
  geom_point(position="identity") +  
  ggtitle("Goals in function of Pledges") + 
  xlab("log(Goals)") + 
  ylab("log(pledged)") + 
  theme_general

Goals in function of backers

ggplot(df_goal, aes(log(backers),log(goal), color = outcome)) + 
  geom_point(position="identity") +   
  ggtitle("Backers in function of Goals") + 
  xlab("log(Backers)") + 
  ylab("log(Goals") + 
  theme_general

Pledge’s length in function of backers

ggscatter(KS, x = "days", y = "pledged", 
          add = "reg.line", 
          conf.int = TRUE, 
          cor.coef = TRUE,
          cor.method = "pearson",
          xlab = "Number of backers", 
          ylab = "Pledged amount (USD))") + 
  theme_general

## `geom_smooth()` using formula 'y ~ x'

BUILDING MODEL

str(KS)

## Classes 'tbl_df', 'tbl' and 'data.frame':    14015 obs. of  12 variables:
##  $ id           : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Category     : Factor w/ 15 levels "Art","Comics",..: 7 7 7 6 7 1 15 7 7 3 ...
##  $ Subcategory  : Factor w/ 157 levels "3D Printing",..: 152 128 6 8 39 70 140 152 24 32 ...
##  $ launchedDay  : num  25 16 28 22 24 25 1 21 3 14 ...
##  $ deadlineMonth: num  4 11 8 7 8 9 7 5 3 4 ...
##  $ deadlineDay  : num  25 16 28 22 24 25 1 21 3 14 ...
##  $ days         : num  30 30 30 30 35 37 45 30 30 30 ...
##  $ goal         : num  15000 224 5000 6000 2000000 ...
##  $ backers      : num  3 23 28 218 2 5 45 4 0 11 ...
##  $ pledged      : num  20 414 1497 8795 2 ...
##  $ outcome      : Factor w/ 2 levels "failed","successful": 1 2 1 2 1 1 1 1 1 1 ...
##  $ launchedmonth: Ord.factor w/ 12 levels "Jan"<"Fev"<"Mar"<..: 4 11 8 7 8 9 7 5 3 4 ...

Creative <- KS %>%
  dplyr::select(Category, launchedDay, launchedmonth, backers, days, pledged, goal, outcome)

head(Creative)

## # A tibble: 6 x 8
##   Category     launchedDay launchedmonth backers  days pledged     goal
##   <fct>              <dbl> <ord>           <dbl> <dbl>   <dbl>    <dbl>
## 1 Film & Video          25 Apr                 3    30     20    15000 
## 2 Film & Video          16 Nov                23    30    414.     224.
## 3 Film & Video          28 Aug                28    30   1497     5000 
## 4 Fashion               22 July              218    30   8795     6000 
## 5 Film & Video          24 Aug                 2    35      2  2000000 
## 6 Art                   25 Sept                5    37    230     2500 
##   outcome   
##   <fct>     
## 1 failed    
## 2 successful
## 3 failed    
## 4 successful
## 5 failed    
## 6 failed

#— Training & Validation Sets —#

library(caret)

## Warning: package 'caret' was built under R version 3.6.3

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

## The following object is masked from 'package:purrr':
## 
##     lift

set.seed(100)
Trainindex <- createDataPartition(y = Creative$outcome, p = .70, list = FALSE)
training <- Creative[Trainindex ,]
validation <- Creative[-Trainindex,]
training_new <- training[-8]
validation_new <- validation[-8]
training_label <- training$outcome
validation_label <- validation$outcome

3.1. Logistic regression

library(rpart)

## Warning: package 'rpart' was built under R version 3.6.3

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 3.6.3

library(caret)

set.seed(123)
default_glm_mod <- train(form = outcome ~ .,
                         data = training,
                         method = "glm",
                         family = "binomial",
                         tuneLength = 5)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

glm_pred <- predict(default_glm_mod, newdata= validation)
confusionMatrix(glm_pred, validation$outcome, positive = "successful")

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   failed successful
##   failed       2633          2
##   successful      2       1567
##                                           
##                Accuracy : 0.999           
##                  95% CI : (0.9976, 0.9997)
##     No Information Rate : 0.6268          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.998           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9987          
##             Specificity : 0.9992          
##          Pos Pred Value : 0.9987          
##          Neg Pred Value : 0.9992          
##              Prevalence : 0.3732          
##          Detection Rate : 0.3727          
##    Detection Prevalence : 0.3732          
##       Balanced Accuracy : 0.9990          
##                                           
##        'Positive' Class : successful      
##

summary(default_glm_mod)

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3953   0.0000   0.0000   0.0000   0.4835  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             1.535e+01  2.488e+02   0.062   0.9508    
## CategoryComics         -1.265e+01  9.261e+01  -0.137   0.8914    
## CategoryCrafts         -6.878e+00  6.477e+00  -1.062   0.2883    
## CategoryDance           2.407e+02  2.312e+05   0.001   0.9992    
## CategoryDesign          2.526e+00  6.290e+01   0.040   0.9680    
## CategoryFashion        -5.842e+00  5.825e+00  -1.003   0.3159    
## `CategoryFilm & Video` -2.403e+00  4.308e+00  -0.558   0.5770    
## CategoryFood           -8.931e+00  5.346e+00  -1.671   0.0948 .  
## CategoryGames          -6.475e+00  5.092e+00  -1.271   0.2036    
## CategoryJournalism      4.772e+00  8.147e+01   0.059   0.9533    
## CategoryMusic          -3.628e+00  4.876e+00  -0.744   0.4568    
## CategoryPhotography     7.384e+00  1.090e+02   0.068   0.9460    
## CategoryPublishing     -8.153e+00  3.955e+00  -2.061   0.0393 *  
## CategoryTechnology     -7.458e+00  1.168e+02  -0.064   0.9491    
## CategoryTheater        -2.255e+00  9.079e+00  -0.248   0.8038    
## launchedDay             2.115e-02  1.285e-01   0.165   0.8693    
## launchedmonth.L         4.234e+00  6.241e+02   0.007   0.9946    
## launchedmonth.Q        -7.135e+00  4.629e+02  -0.015   0.9877    
## launchedmonth.C        -1.194e+01  1.040e+03  -0.011   0.9908    
## `launchedmonth^4`      -4.614e+00  4.342e+02  -0.011   0.9915    
## `launchedmonth^5`       9.839e+00  6.861e+02   0.014   0.9886    
## `launchedmonth^6`       1.703e+01  1.114e+03   0.015   0.9878    
## `launchedmonth^7`      -9.812e-01  4.085e+02  -0.002   0.9981    
## `launchedmonth^8`      -5.329e+00  7.602e+02  -0.007   0.9944    
## `launchedmonth^9`      -2.387e+01  1.415e+03  -0.017   0.9865    
## `launchedmonth^10`     -1.047e+01  1.221e+03  -0.009   0.9932    
## `launchedmonth^11`     -8.940e+00  5.862e+02  -0.015   0.9878    
## backers                -1.378e-02  1.396e-01  -0.099   0.9213    
## days                   -1.242e-01  1.065e-01  -1.166   0.2435    
## pledged                 9.599e-01  1.855e-01   5.176 2.27e-07 ***
## goal                   -9.570e-01  1.850e-01  -5.173 2.30e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12963.5705  on 9810  degrees of freedom
## Residual deviance:     1.9256  on 9780  degrees of freedom
## AIC: 63.926
## 
## Number of Fisher Scoring iterations: 25

Only pledged and goal are important. bakers is not important

#— r part —#

set.seed(100)
rparttree  <- rpart(outcome ~ pledged + goal, data = training, 
                 method = "class", 
                 cp = .001, 
                 minsplit = 5, 
                 xval = 5)

# Plot the trees
rpart.plot(rparttree)

plotcp(rparttree)

set.seed(123)

optimal_tree <- rpart(
    formula = outcome ~ .,
    data    = training,
    method  = "class",
    control = list(minsplit = 50, 
                   minbucket= 100, 
                   maxdepth = 4, 
                   xval = 10,
                   cp = 0.0024))

pred <- predict(optimal_tree, newdata = validation,type = "class")

confusionMatrix(data = pred , validation$outcome, positive = "successful")

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   failed successful
##   failed       2472         41
##   successful    163       1528
##                                           
##                Accuracy : 0.9515          
##                  95% CI : (0.9445, 0.9578)
##     No Information Rate : 0.6268          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8979          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9739          
##             Specificity : 0.9381          
##          Pos Pred Value : 0.9036          
##          Neg Pred Value : 0.9837          
##              Prevalence : 0.3732          
##          Detection Rate : 0.3635          
##    Detection Prevalence : 0.4022          
##       Balanced Accuracy : 0.9560          
##                                           
##        'Positive' Class : successful      
##

library(rattle)

## Warning: package 'rattle' was built under R version 3.6.3

## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

fancyRpartPlot(optimal_tree, 
               palettes=c("Greys", "Blues"),
               main = "Creative Kickstarter Classification Tree")

rpart.plot(optimal_tree, type = 4, 
           clip.right.labs = FALSE,
           branch.lty = 3, # dotted branch lines
           branch = 0.7, 
           under = TRUE,
           nn = TRUE)

Kickstarter project-Part 2: Creating consistent data, Performing Exploratoy data analysis, and building the Decision tree

Minh-Trung DANG

11/04/2020