library(readr)
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v dplyr 0.8.4
## v tibble 2.1.3 v stringr 1.4.0
## v tidyr 1.0.2 v forcats 0.4.0
## v purrr 0.3.3
## Warning: package 'ggplot2' was built under R version 3.6.3
## -- Conflicts -------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(DataExplorer)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
library(ggplot2)
options(tibble.width = Inf)
options(tibble.print_max = 10440, tibble.print_min = 6)
getwd()
## [1] "C:/Users/user/Desktop/Data analysis projects/Kickstarter/code"
KS <- read_csv("Kickstarter2.csv")
## Parsed with column specification:
## cols(
## id = col_double(),
## CategoryID = col_double(),
## Category = col_character(),
## SubcategoryId = col_double(),
## Subcategory = col_character(),
## goal = col_double(),
## pledged = col_double(),
## backers = col_double(),
## launched = col_datetime(format = ""),
## deadline = col_datetime(format = ""),
## days = col_double(),
## outcome = col_character()
## )
str(KS)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 14257 obs. of 12 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ CategoryID : num 2 2 2 9 2 11 10 2 2 6 ...
## $ Category : chr "Film & Video" "Film & Video" "Film & Video" "Fashion" ...
## $ SubcategoryId: num 23 47 24 52 9 27 20 23 54 12 ...
## $ Subcategory : chr "Webseries" "Shorts" "Animation" "Apparel" ...
## $ goal : num 15000 224 5000 6000 2000000 ...
## $ pledged : num 20 414 1497 8795 2 ...
## $ backers : num 3 23 28 218 2 5 45 4 0 11 ...
## $ launched : POSIXct, format: "2013-04-25" "2015-11-16" ...
## $ deadline : POSIXct, format: "2013-04-25" "2015-11-16" ...
## $ days : num 30 30 30 30 35 37 45 30 30 30 ...
## $ outcome : chr "failed" "successful" "failed" "successful" ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. CategoryID = col_double(),
## .. Category = col_character(),
## .. SubcategoryId = col_double(),
## .. Subcategory = col_character(),
## .. goal = col_double(),
## .. pledged = col_double(),
## .. backers = col_double(),
## .. launched = col_datetime(format = ""),
## .. deadline = col_datetime(format = ""),
## .. days = col_double(),
## .. outcome = col_character()
## .. )
Checking the inconsistency of the data
Convert Category, Subcategory and outcome into factor
KS <- KS %>%
mutate_if(is.character,as.factor)
class(KS$outcome)
## [1] "factor"
Create columns of year, month, and day of month. Convert them into numeric
KS$launchedMonth <- as.numeric(format(KS$launched, "%m"))
KS$launchedDay <- as.numeric(format(KS$launched, "%d"))
KS$deadlineMonth <- as.numeric(format(KS$deadline, "%m"))
KS$deadlineDay <- as.numeric(format(KS$deadline, "%d"))
Rearrange columns using tidyverse (This step can be done with SQL)
KS <- KS %>%
dplyr::select(id,
Category,
Subcategory,
launchedMonth, launchedDay,
deadlineMonth, deadlineDay,
days,
goal, backers, pledged,
outcome
)
KS %>%
group_by(launchedMonth) %>%
count()
## # A tibble: 12 x 2
## # Groups: launchedMonth [12]
## launchedMonth n
## <dbl> <int>
## 1 1 1056
## 2 2 1095
## 3 3 1354
## 4 4 1249
## 5 5 1298
## 6 6 1240
## 7 7 1346
## 8 8 1214
## 9 9 1145
## 10 10 1255
## 11 11 1229
## 12 12 776
Recoding
KS <- KS %>%
mutate(launchedmonth = case_when(
launchedMonth == "1" ~ "Jan",
launchedMonth == "2" ~ "Fev",
launchedMonth == "3" ~ "Mar",
launchedMonth == "4" ~ "Apr",
launchedMonth == "5" ~ "May",
launchedMonth == "6" ~ "June",
launchedMonth == "7" ~ "July",
launchedMonth == "8" ~ "Aug",
launchedMonth == "9" ~ "Sept",
launchedMonth == "10" ~ "Oct",
launchedMonth == "11" ~ "Nov",
launchedMonth == "12" ~ "Dec"))
KS$launchedmonth <- factor(KS$launchedmonth,
levels = c("Jan","Fev","Mar",
"Apr","May","June",
"July","Aug", "Sept",
"Oct","Nov","Dec"),ordered = TRUE)
KS <- subset(KS, select = -c(launchedMonth))
KS %>%
group_by(launchedmonth) %>%
count()
## # A tibble: 12 x 2
## # Groups: launchedmonth [12]
## launchedmonth n
## <ord> <int>
## 1 Jan 1056
## 2 Fev 1095
## 3 Mar 1354
## 4 Apr 1249
## 5 May 1298
## 6 June 1240
## 7 July 1346
## 8 Aug 1214
## 9 Sept 1145
## 10 Oct 1255
## 11 Nov 1229
## 12 Dec 776
Projects on Kickstarter can last anywhere from 1 - 60 days.
KS %>%
filter(days > 60) %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 234
Only keep records where projects can last anywhere from 1 to 60 days
KS <- KS %>%
filter(days <= 60)
KS %>%
filter(days == 0) %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 0
There is no project that launched and ended within a day.
library(DataExplorer)
plot_missing(KS)
The data does not contain missing values
KS %>%
filter(backers == 0 & outcome == "successful") %>%
summarise( count = n())
## # A tibble: 1 x 1
## count
## <int>
## 1 7
There are seven records that contain error. What are these rows?
KS %>%
select(id, Category, backers, outcome) %>%
filter(backers == 0 & outcome == "successful")
## # A tibble: 7 x 4
## id Category backers outcome
## <dbl> <fct> <dbl> <fct>
## 1 589 Music 0 successful
## 2 3851 Music 0 successful
## 3 4122 Music 0 successful
## 4 4200 Film & Video 0 successful
## 5 6738 Music 0 successful
## 6 9516 Music 0 successful
## 7 14872 Film & Video 0 successful
To remove these seven records
row_toremove <- with(KS, which(backers == 0 & outcome == "successful", arr.ind=TRUE))
KS <- KS[-row_toremove,]
KS %>%
select(id, Category, backers, pledged, goal, outcome) %>%
filter(pledged > goal & outcome == "failed")
## # A tibble: 1 x 6
## id Category backers pledged goal outcome
## <dbl> <fct> <dbl> <dbl> <dbl> <fct>
## 1 11467 Publishing 66 10797 10000 failed
KS$outcome[KS$pledged > KS$goal & KS$outcome == "failed"] <- "successful"
b2. goal > pleded and the outcome = successful (the cutcome should be failed)
KS %>%
select(id, Category, backers, pledged, goal, outcome) %>%
filter(pledged < goal & outcome == "successful")
## # A tibble: 0 x 6
## # ... with 6 variables: id <dbl>, Category <fct>, backers <dbl>, pledged <dbl>,
## # goal <dbl>, outcome <fct>
b3. goal > plegded and the outcome = canceled, because the outcome should be labelled as failed. It may not be an error if the founder decided to cancel the project. However, for this project, we will change the outcome of these records from canceled to succefull
KS %>%
filter(pledged < goal & outcome == "canceled") %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 1053
There are 1073 rows.
KS %>%
select(id, Category, backers, pledged, goal, outcome) %>%
filter(pledged < goal & outcome == "canceled") %>%
head(10)
## # A tibble: 10 x 6
## id Category backers pledged goal outcome
## <dbl> <fct> <dbl> <dbl> <dbl> <fct>
## 1 38 Technology 0 0 25000 canceled
## 2 50 Design 19 912 5000 canceled
## 3 63 Publishing 30 4373 50000 canceled
## 4 98 Music 16 1979 18000 canceled
## 5 99 Games 24 5816 50000 canceled
## 6 101 Publishing 1 9 1500 canceled
## 7 133 Fashion 0 0 8000 canceled
## 8 135 Art 0 0 4303 canceled
## 9 169 Technology 0 0 20000 canceled
## 10 177 Photography 0 0 15000 canceled
These projects are canceled because pledged money is way less than goal. These projects are propably failed. We will apply “Conditionally replace multiple rows in a data frame”
KS$outcome[KS$pledged < KS$goal & KS$outcome == "canceled"] <- "failed"
b4. plegded > goal and the outcome = canceled, because the outcome should be labelled as successful. Similarly, we can change the outcomes of these records from canceled to faile
KS %>%
filter(pledged > goal & outcome == "canceled") %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 13
KS %>%
filter(pledged > goal & outcome == "canceled") %>%
head(10)
## # A tibble: 10 x 12
## id Category Subcategory launchedDay deadlineMonth deadlineDay days
## <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 797 Food Drinks 16 9 16 30
## 2 2871 Games Tabletop Games 2 3 2 29
## 3 4761 Games Video Games 12 10 12 30
## 4 6489 Technology Flight 18 2 18 30
## 5 6530 Design Architecture 16 12 16 45
## 6 7596 Technology Technology 28 1 28 46
## 7 8858 Food Food 9 7 9 30
## 8 10121 Fashion Footwear 15 10 15 28
## 9 10796 Design Product Design 22 11 22 31
## 10 11701 Technology Gadgets 24 4 24 36
## goal backers pledged outcome launchedmonth
## <dbl> <dbl> <dbl> <fct> <ord>
## 1 1000 45 1951 canceled Sept
## 2 35000 305 37954 canceled Mar
## 3 10000 163 10275. canceled Oct
## 4 100000 243 257771 canceled Fev
## 5 80 2 85 canceled Dec
## 6 100000 575 137377 canceled Jan
## 7 8 8 237 canceled July
## 8 2000 68 2249 canceled Oct
## 9 3000 228 13599 canceled Nov
## 10 60000 613 96583 canceled Apr
The outcome of these records will be rewritten as successful.
KS$outcome[KS$pledged > KS$goal & KS$outcome == "canceled"] <- "successful"
How many projects that were canceled?
KS %>%
group_by(outcome) %>%
count()
## # A tibble: 3 x 2
## # Groups: outcome [3]
## outcome n
## <fct> <int>
## 1 canceled 1
## 2 failed 8784
## 3 successful 5231
there is only one project that was canceled
KS %>%
select(id, Category, backers, pledged, goal, outcome) %>%
filter(outcome == "canceled")
## # A tibble: 1 x 6
## id Category backers pledged goal outcome
## <dbl> <fct> <dbl> <dbl> <dbl> <fct>
## 1 2540 Journalism 1 1 1 canceled
We can remove that record. A amount of 1 dollar as a goal seem to be not reasonable
KS <- KS %>%
filter(id != 2540)
How about goal ? are there any projects that put 0 dollar as goal?
summary(KS$goal)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 2000 5296 61730 15293 100000000
How many project that the goal are less than 100 dollars
KS %>%
filter(goal < 100) %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 166
Were these projects successful?
KS %>%
group_by(outcome) %>%
filter(goal < 100) %>%
count()
## # A tibble: 2 x 2
## # Groups: outcome [2]
## outcome n
## <fct> <int>
## 1 failed 75
## 2 successful 91
So far, we will keep these records.
str(KS)
## Classes 'tbl_df', 'tbl' and 'data.frame': 14015 obs. of 12 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Category : Factor w/ 15 levels "Art","Comics",..: 7 7 7 6 7 1 15 7 7 3 ...
## $ Subcategory : Factor w/ 157 levels "3D Printing",..: 152 128 6 8 39 70 140 152 24 32 ...
## $ launchedDay : num 25 16 28 22 24 25 1 21 3 14 ...
## $ deadlineMonth: num 4 11 8 7 8 9 7 5 3 4 ...
## $ deadlineDay : num 25 16 28 22 24 25 1 21 3 14 ...
## $ days : num 30 30 30 30 35 37 45 30 30 30 ...
## $ goal : num 15000 224 5000 6000 2000000 ...
## $ backers : num 3 23 28 218 2 5 45 4 0 11 ...
## $ pledged : num 20 414 1497 8795 2 ...
## $ outcome : Factor w/ 3 levels "canceled","failed",..: 2 3 2 3 2 2 2 2 2 2 ...
## $ launchedmonth: Ord.factor w/ 12 levels "Jan"<"Fev"<"Mar"<..: 4 11 8 7 8 9 7 5 3 4 ...
table(KS$outcome)
##
## canceled failed successful
## 0 8784 5231
The outcome still has 3 factor levels. We will remove the unused level (“canceled”)
KS$outcome <- factor(KS$outcome)
table(KS$outcome)
##
## failed successful
## 8784 5231
theme_general <- theme(plot.title = element_text(hjust = 0.5),
axis.title = element_text(size = 14, face = "bold", color = "black"),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
legend.position = "right",
legend.title = element_text(size = 14),
legend.text = element_text(colour="black", size = 14))
Q1. What categories are most popular? We first begin by examining the number of projects by category
Category_df <- KS %>%
group_by(Category) %>%
summarise(counts = n()) %>%
mutate(Percentage = round(counts*100/sum(counts),2)) %>%
arrange(desc(counts))
ggplot(data = Category_df,
aes(x= reorder(Category, -counts),
y = counts,
fill = Percentage)) +
geom_bar(stat = "identity",
width = 0.7) +
scale_fill_gradient(low="skyblue1", high="royalblue4")+
geom_text(aes(label = paste0(counts,"\n", Percentage,"%")),
vjust = -0.1,
color = "darkblue",
size = 5) +
scale_y_continuous(limits = c(0,2500)) +
theme_minimal() +
labs(x = "Category",
y = "Frequency",
caption = "Kickstarter 2009-2014") +
theme_general
Film & Video, Music, and Publishing lead the chart. Journalism, Dance and Crafts are the least popular.
Preliminary analysis using SQL revealed that there are 153 subcatergories.
What are the top ten subcategories ?
subcat.freq <- KS %>%
dplyr::group_by(Subcategory) %>%
dplyr::summarize(Frequency = n()) %>%
arrange(desc(Frequency)) %>%
head(10)
ggplot(subcat.freq,
aes(x = reorder(Subcategory,-Frequency), Frequency, fill = Frequency)) +
geom_bar(stat="identity",
width = 0.65,
fill = "skyblue1",
colour = "skyblue1") +
labs(title = "Number of Projects by subcategory",
x = "The top 10 Project Subcategory",
y = "Frequency") +
geom_text(aes(label = Frequency),
vjust = 0.5,
hjust = -0.1,
color = "darkblue",
size = 5) +
coord_flip() +
theme_general
How to find which Category these subcategories belong?
KS %>%
select(Category, Subcategory) %>%
group_by(Category, Subcategory) %>%
filter(Subcategory %in% c("Product Design", "Documentary", "Music", "Tabletop Games",
"Shorts", "Video Games", "Food", "Film & Video", "Nonfiction", "Fiction")) %>%
summarise(fre = n()) %>%
arrange(desc(fre))
## # A tibble: 10 x 3
## # Groups: Category [6]
## Category Subcategory fre
## <fct> <fct> <int>
## 1 Design Product Design 876
## 2 Film & Video Documentary 654
## 3 Games Tabletop Games 521
## 4 Music Music 508
## 5 Film & Video Shorts 477
## 6 Games Video Games 421
## 7 Food Food 406
## 8 Film & Video Film & Video 356
## 9 Publishing Fiction 356
## 10 Publishing Nonfiction 356
These top sub-categories stemm from category: Design, Film & Video, Music, Games, Food and Publishing.
Question 2: What types of projects are being funded? This question is akin to the first question but phrased from the perspective of the backers. In other words, the most funded projects are the most popular projects in the perspective of the backers
KS %>%
select(Category, Subcategory, pledged) %>%
arrange(desc(pledged)) %>%
head(10)
## # A tibble: 10 x 3
## Category Subcategory pledged
## <fct> <fct> <dbl>
## 1 Technology Web 5408917.
## 2 Games Tabletop Games 3999796.
## 3 Film & Video Narrative Film 3105473.
## 4 Games Video Games 2278255.
## 5 Art Sculpture 1924018
## 6 Design Product Design 1842142.
## 7 Technology Hardware 1652247.
## 8 Games Tabletop Games 1546270.
## 9 Games Tabletop Games 1341305.
## 10 Technology Space Exploration 1241615.
A lot of the projects here fall under the “Games” and “Technology” category.
Similarly, let’s list the top 15 most backed projects (i.e. projects with the most backers).
KS %>%
select(Category, Subcategory, backers) %>%
arrange(desc(backers)) %>%
head(20)
## # A tibble: 20 x 3
## Category Subcategory backers
## <fct> <fct> <dbl>
## 1 Technology Web 105857
## 2 Film & Video Narrative Film 46520
## 3 Games Tabletop Games 40642
## 4 Technology Space Exploration 23331
## 5 Design Product Design 20680
## 6 Games Video Games 17765
## 7 Technology Gadgets 17218
## 8 Games Video Games 13987
## 9 Games Tabletop Games 13765
## 10 Design Graphic Design 11334
## 11 Games Video Games 9869
## 12 Technology Hardware 9805
## 13 Film & Video Animation 9122
## 14 Film & Video Science Fiction 8548
## 15 Games Tabletop Games 8396
## 16 Technology Hardware 8359
## 17 Technology Hardware 8019
## 18 Technology Hardware 8018
## 19 Technology Hardware 7766
## 20 Games Video Games 7564
8 technology 7 games 3 film & video
Question 3: What types of projects funding is going towards ? ===> aggregating the amount of funds pledged for each category, providing us with the total amount pledged for each category.
pledged.tot_df <- KS %>%
dplyr::group_by(Category) %>%
dplyr::summarize(total = sum(pledged/1000000)) %>%
mutate(Percentage = round(total*100/sum(total),2)) %>%
arrange(desc(total))
ggplot(data = pledged.tot_df,
aes(x= reorder(Category, -total),
y = total,
fill = Percentage)) +
geom_bar(stat = "identity",
width = 0.7) +
scale_fill_gradient(low="skyblue1", high="royalblue4")+
geom_text(aes(label = paste0("$",round(total,1),"M", "\n",Percentage,"%")),
vjust = -0.1,
color = "darkblue",
size = 5) +
scale_y_continuous(limits = c(0,30)) +
theme_minimal() +
labs(x = "Category",
y = "Pledges (Million USD)",
caption = "Kickstarter 2009-2014") +
ggtitle("Total Amount Pledged by Category") +
theme_general
Technology, Games, and Design are highest grossing categories by far.
Film& Video is the most popular among category. Less than 1 % of projects is Dance or Journalism. For each categpry, which catergory has higher successful rate?
library(arsenal)
## Warning: package 'arsenal' was built under R version 3.6.3
##
## Attaching package: 'arsenal'
## The following object is masked from 'package:Hmisc':
##
## %nin%
table_one <- tableby(outcome ~ Category + goal+ pledged + backers + days , data = KS)
summary(table_one, title = "Summary statistic of Kickstarter data")
##
## Table: Summary statistic of Kickstarter data
##
## | | failed (N=8784) | successful (N=5231) | Total (N=14015) | p value|
## |:------------------------------|:-----------------------:|:----------------------:|:-----------------------:|-------:|
## |**Category** | | | | < 0.001|
## | Art | 621 (7.1%) | 425 (8.1%) | 1046 (7.5%) | |
## | Comics | 171 (1.9%) | 243 (4.6%) | 414 (3.0%) | |
## | Crafts | 246 (2.8%) | 90 (1.7%) | 336 (2.4%) | |
## | Dance | 41 (0.5%) | 81 (1.5%) | 122 (0.9%) | |
## | Design | 730 (8.3%) | 426 (8.1%) | 1156 (8.2%) | |
## | Fashion | 604 (6.9%) | 218 (4.2%) | 822 (5.9%) | |
## | Film & Video | 1492 (17.0%) | 909 (17.4%) | 2401 (17.1%) | |
## | Food | 675 (7.7%) | 253 (4.8%) | 928 (6.6%) | |
## | Games | 773 (8.8%) | 463 (8.9%) | 1236 (8.8%) | |
## | Journalism | 82 (0.9%) | 28 (0.5%) | 110 (0.8%) | |
## | Music | 917 (10.4%) | 932 (17.8%) | 1849 (13.2%) | |
## | Photography | 260 (3.0%) | 141 (2.7%) | 401 (2.9%) | |
## | Publishing | 1026 (11.7%) | 507 (9.7%) | 1533 (10.9%) | |
## | Technology | 960 (10.9%) | 251 (4.8%) | 1211 (8.6%) | |
## | Theater | 186 (2.1%) | 264 (5.0%) | 450 (3.2%) | |
## |**goal** | | | | 0.007|
## | Mean (SD) | 92612.452 (2211566.793) | 9870.564 (36745.974) | 61729.625 (1751416.964) | |
## | Range | 1.000 - 100000000.000 | 0.750 - 2000000.000 | 0.750 - 100000000.000 | |
## |**pledged** | | | | < 0.001|
## | Mean (SD) | 1472.989 (7024.771) | 22676.391 (131737.279) | 9387.009 (81319.461) | |
## | Range | 0.000 - 260372.610 | 4.640 - 5408916.950 | 0.000 - 5408916.950 | |
## |**backers** | | | | < 0.001|
## | Mean (SD) | 18.065 (89.154) | 285.578 (1913.957) | 117.912 (1178.488) | |
## | Range | 0.000 - 4201.000 | 1.000 - 105857.000 | 0.000 - 105857.000 | |
## |**days** | | | | < 0.001|
## | Mean (SD) | 34.576 (11.941) | 31.615 (10.422) | 33.471 (11.487) | |
## | Range | 1.000 - 60.000 | 1.000 - 60.000 | 1.000 - 60.000 | |
It is important to take the number of backers into account as well ==> determine the average amount pledged per backer for each category. ==> calculate this by taking the total amount pledged for each category and dividing it by the total number of backers for each category.
library(pander)
pledged.avg <- KS %>%
dplyr::group_by(Category) %>%
dplyr::summarize(pledged = sum(pledged), backers = sum(backers)) %>%
mutate(avg = pledged/backers) %>%
arrange(desc(avg))
pledged.avg$Category <- factor(pledged.avg$Category, levels = pledged.avg$Category)
pledged.avg %>%
pander()
Category | pledged | backers | avg |
---|---|---|---|
Art | 4907098 | 43591 | 112.6 |
Food | 4619730 | 45354 | 101.9 |
Fashion | 5412036 | 53726 | 100.7 |
Design | 23300952 | 256999 | 90.67 |
Film & Video | 18921532 | 217493 | 87 |
Photography | 1515172 | 17513 | 86.52 |
Technology | 27749278 | 327101 | 84.83 |
Dance | 483046 | 5947 | 81.23 |
Theater | 1474214 | 18359 | 80.3 |
Journalism | 439491 | 6021 | 72.99 |
Games | 27480040 | 406795 | 67.55 |
Music | 6850120 | 103560 | 66.15 |
Publishing | 5149074 | 84421 | 60.99 |
Crafts | 571697 | 10361 | 55.18 |
Comics | 2685461 | 55302 | 48.56 |
library(hrbrthemes)
## Warning: package 'hrbrthemes' was built under R version 3.6.3
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
ggplot(pledged.avg, aes(Category, avg, fill = avg)) +
geom_bar(stat = "identity") +
labs(title = "Average Amount Pledged per Backer",
x = "Project Category",
y = "Amount Pledged (USD)") +
geom_text(aes(label=paste0("$", round(avg,2))),
vjust= 0.5,
hjust = -0.1) +
scale_fill_distiller(palette = "Spectral") +
coord_flip() +
theme_general
“Art” has the highest average amount pledged, whereas Comics has the lowest average amount pledged. An interesting note here is that the average amount pledged for “Art” is nearly double that of “Comics”, even though Games had the higher aggregate amount pledged as shown in the previous graph.
Question 4: Distribution of pledged for each category
ggplot(KS, aes(Category, pledged, fill = Category)) +
geom_boxplot() +
ggtitle("Amount Pledged vs. Project Category") +
xlab("Project Category") +
ylab("Amount Pledged (USD)") +
theme(plot.title = element_text(size = 15, face = "bold", hjust = 0.5),
axis.title = element_text(size = 12, face = "bold"),
axis.text.x=element_text(size = 12, angle = 90), legend.position = "null") +
coord_cartesian(ylim=c(0,30000))
There are no doubt a lot of projects that received little to no funding as well as huge outliers.
KS %>%
group_by(Category) %>%
summarise(median = median(pledged)) %>%
arrange(desc(median))
## # A tibble: 15 x 2
## Category median
## <fct> <dbl>
## 1 Dance 2328
## 2 Design 2091.
## 3 Comics 1720.
## 4 Theater 1403.
## 5 Games 1378.
## 6 Music 1085
## 7 Film & Video 757.
## 8 Art 460.
## 9 Photography 399.
## 10 Publishing 320
## 11 Food 307.
## 12 Technology 302
## 13 Fashion 279.
## 14 Journalism 141.
## 15 Crafts 90.0
Designe, Game and Comics have higher median than other category.
Many projects, smnaller goals = lower median. Higher median == less project, less backer, but the pledged is dispersed.
Having looked at both the amount pledged and goal for different kinds of projects, see how the distribution of each compares to one another. Since we can expect both distributions to be heavily right-skewed due to many projects that received little to no funding and extremely high outliers, we will use a log transformation on both variables to better visualize their distributions.
outcome.pct <- KS %>%
dplyr::group_by(Category, outcome) %>%
dplyr::summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
arrange(desc(outcome), pct)
outcome.pct$Category <- factor(outcome.pct$Category,levels=outcome.pct$Category[1:(nrow(outcome.pct)/2)])
plot1 <- ggplot(outcome.pct, aes(reorder(Category,-count), pct, fill = outcome)) +
geom_bar(stat="identity") +
ggtitle("Success vs. Failure Rate by Project Category") +
xlab("Project Category") +
ylab("Percentage") +
scale_y_continuous(labels=scales::percent) +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
geom_text(aes(label = paste0(round(pct*100,1),"%")),
position = position_stack(vjust=0.5),
colour = "blue",
size = 3.5,
fontface = "bold") +
theme(plot.title = element_text(hjust = 0.5),
axis.title = element_text(size = 12),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
legend.position = "bottom",
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(colour="black", size = 12)) +
coord_flip()
plot2 <- ggplot(outcome.pct, aes(reorder(Category,-count), count, fill = outcome)) +
geom_bar(stat="identity") +
ggtitle("Success vs. Failure Rate by Project Category") +
xlab("Project Category") + ylab("Number of Projects") +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
geom_text(aes(label = paste0(round(count,1),"")),
position = position_stack(vjust = 0.5),
colour = "blue",
size = 3,
fontface = "bold") +
theme(plot.title = element_text(hjust = 0.5),
axis.title = element_text(size = 12),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
legend.position = "bottom",
legend.title = element_text(size = 12),
legend.text = element_text(colour="black", size = 12)) +
coord_flip()
gridExtra::grid.arrange(plot1, plot2, ncol = 2)
Dance, Comics, and Theater have the highest success rates and Technology, Fashion, Journalism, Crafts, and Foods have the lowest. This agrees with the box plots for amounts pledged and project goal amounts above as Dance and Comics both had high median amounts pledged and low median goals, with Theater having a low median goal as well. Technology, Journalism, and Crafts had low median amounts pledged, with Technology having a high median goal. In general, the higher the amount pledged and/or the lower the goal, the more likely a project will be successful. Interestingly enough, Crafts, having a low median amount pledged, also has a low success rate despite having a low median goal, which may indicate that people generally are not as interested in this category as a whole.
Question 5 : What types of projects were successful and unsuccessful? reak down the number of projects by their status (e.g. successful, failed, cancelled, etc.).
outcome_prop <- KS %>%
group_by(outcome) %>%
count() %>%
ungroup()%>%
arrange(desc(outcome)) %>%
mutate(percentage = round(n/sum(n),4)*100,
lab.pos = cumsum(percentage)-0.5*percentage)
ggplot(data = outcome_prop,
aes(x = "",
y = percentage,
fill = outcome))+
geom_bar(stat = "identity")+
coord_polar("y") +
geom_text(aes(y = lab.pos,
label = paste(percentage,"%", sep = "")), col = "blue", size = 5) +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
theme_void() +
theme(legend.title = element_text(color = "black", size = 14),
legend.text = element_text(color = "black", size = 14))
About two thirds of projects see the light of day.
Question 6: Month of year matters? 1. Number of projects
df_month <- KS %>%
group_by(launchedmonth) %>%
summarise(Count = n())
ggplot(df_month, aes(launchedmonth, Count)) +
geom_point(aes(size = Count, group = Count), colour = "blue") +
labs(title = "Number of projects by month",
x = "Month",
y = "Frequency") +
theme(plot.title = element_text(hjust = 0.5),
axis.title = element_text(size = 14, face = "bold"),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
legend.position = "right",
legend.title = element_text(size= 14, face = "bold"))
Less projects launched in January and February. Much less projects launched in December
month.pct <- KS %>%
dplyr::group_by(launchedmonth, outcome) %>%
dplyr::summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
arrange(desc(outcome), pct)
ggplot(month.pct, aes(launchedmonth,
pct,
fill = outcome)) +
geom_bar(stat="identity") +
ggtitle("Success vs. Failure Rate by month") +
xlab("Month") +
ylab("Percentage") +
scale_y_continuous(labels = scales::percent) +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
geom_text(aes(label = paste0(round(pct*100,1),"%")),
position = position_stack(vjust = 0.5),
colour = "blue",
size = 3.5,
fontface = "bold") +
theme_general
The rate of success is lowest in July (32.8%) and highest in March and November (about 40%)
Chi-square test for association
H0: Month is not associated with the outcome Ha: Month is associated with the outcome
chisq.test(KS$launchedmonth, KS$outcome)
##
## Pearson's Chi-squared test
##
## data: KS$launchedmonth and KS$outcome
## X-squared = 28.469, df = 11, p-value = 0.002742
The p-value = 0.003 < 0.01. We reject the null hypothesis. We conclude that month is associated with the outcome
Day of month matters?
launchedDay_tbl <- KS %>%
group_by(launchedDay) %>%
count()
ggplot(launchedDay_tbl, aes(x = launchedDay, y = n)) +
geom_line(color = "grey", size = 1.5) +
geom_point(color = "blue", size = 4)+
geom_smooth(colour="green", method = "lm", se = FALSE) +
scale_x_continuous(breaks = seq(from = 0, to = 31, by = 5)) +
labs(title = "Number of projects by day of month",
x = "Day of month",
y = "Frequency") +
theme_set(theme_classic()) +
theme_general
## `geom_smooth()` using formula 'y ~ x'
The number of launched projects is highest on the first day of month and decreases toward the end of month. What are the successful rate in function of day
day.pct <- KS %>%
dplyr::group_by(launchedDay, outcome) %>%
dplyr::summarize(count = n()) %>%
mutate(percentage = count/sum(count)) %>%
arrange(launchedDay, percentage)
ggplot(day.pct, aes(launchedDay,
percentage,
fill = outcome)) +
geom_bar(stat="identity") +
ggtitle("Success vs. Failure Rate by day of month") +
xlab("Day of month") +
ylab("Percentage") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(breaks = c(1:31))+
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
geom_text(aes(label = paste0(round(percentage*100,1),"%")),
position = position_stack(vjust = 0.5),
colour = "blue",
size = 3.5,
fontface = "bold") +
theme_general
The lowest successful rate is 33.5 and the highest successful rate is 43.4% There is no pattern that can be observed here.
Outcome and duration of project
# Overlaid histograms
ggplot(KS, aes(x=days, color=outcome)) +
geom_histogram(fill="white", alpha=0.5, position="identity") +
scale_color_manual(values=c("red", "blue"))+
scale_fill_manual(values=c("red", "blue"))+
theme_general
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(KS, aes(x = factor(outcome),
y = days,
fill = factor(outcome))) +
geom_boxplot() +
geom_jitter(width = 0.35,
alpha = 0.2,
shape = 15,
color = "steelblue") +
scale_y_continuous (breaks = seq(0, 70, 15),
limits = c(0, 70)) +
ggtitle("Distribution of days by outcome") +
xlab("Outcome") +
ylab("Duration of projects (days)") +
labs(fill = "Outcome") +
scale_fill_manual(values=c("#E3CD81FF", "#B1B3B3FF")) +
theme(axis.line = element_line(size = 0.5, colour = "grey"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, face = "bold"),
text = element_text(colour = "black", size = 14),
axis.text.x = element_text(colour = "black", size = 14),
axis.text.y = element_text(colour = "black", size = 14),
legend.position = "right")
There are many projects that were launched and ended for a duration of 30, 45, and 60 days.
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 3.6.3
#Correlation backers, pledge, goal and days
corr <- round(cor(KS[c(7:10)]),2)
ggcorrplot(corr,
type = "upper",
lab = TRUE,
outline.col = "white",
ggtheme = ggplot2::theme_gray,
colors = c("#6D9EC1", "white", "#E46726"))
There is a strong and positive correlatin between pledged and backers.
ggscatter(KS, x = "backers", y = "pledged",
add = "reg.line",
conf.int = TRUE,
cor.coef = TRUE,
cor.method = "pearson",
xlab = "Number of backers",
ylab = "Pledged amount (USD))") + theme_general
## `geom_smooth()` using formula 'y ~ x'
df_goal <- KS %>%
filter(goal!="0")
ggplot(df_goal,
aes(log(backers),log(pledged),
color = outcome)) +
geom_point(position = "identity") +
ggtitle("Backers in function of pledged amount") + xlab("log(Backers)") +
ylab("Pledged") +
theme_general
Goals in function of usd pledged
ggplot(df_goal, aes(log(goal),log(pledged), color = outcome)) +
geom_point(position="identity") +
ggtitle("Goals in function of Pledges") +
xlab("log(Goals)") +
ylab("log(pledged)") +
theme_general
Goals in function of backers
ggplot(df_goal, aes(log(backers),log(goal), color = outcome)) +
geom_point(position="identity") +
ggtitle("Backers in function of Goals") +
xlab("log(Backers)") +
ylab("log(Goals") +
theme_general
Pledge’s length in function of backers
ggscatter(KS, x = "days", y = "pledged",
add = "reg.line",
conf.int = TRUE,
cor.coef = TRUE,
cor.method = "pearson",
xlab = "Number of backers",
ylab = "Pledged amount (USD))") +
theme_general
## `geom_smooth()` using formula 'y ~ x'
str(KS)
## Classes 'tbl_df', 'tbl' and 'data.frame': 14015 obs. of 12 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Category : Factor w/ 15 levels "Art","Comics",..: 7 7 7 6 7 1 15 7 7 3 ...
## $ Subcategory : Factor w/ 157 levels "3D Printing",..: 152 128 6 8 39 70 140 152 24 32 ...
## $ launchedDay : num 25 16 28 22 24 25 1 21 3 14 ...
## $ deadlineMonth: num 4 11 8 7 8 9 7 5 3 4 ...
## $ deadlineDay : num 25 16 28 22 24 25 1 21 3 14 ...
## $ days : num 30 30 30 30 35 37 45 30 30 30 ...
## $ goal : num 15000 224 5000 6000 2000000 ...
## $ backers : num 3 23 28 218 2 5 45 4 0 11 ...
## $ pledged : num 20 414 1497 8795 2 ...
## $ outcome : Factor w/ 2 levels "failed","successful": 1 2 1 2 1 1 1 1 1 1 ...
## $ launchedmonth: Ord.factor w/ 12 levels "Jan"<"Fev"<"Mar"<..: 4 11 8 7 8 9 7 5 3 4 ...
Creative <- KS %>%
dplyr::select(Category, launchedDay, launchedmonth, backers, days, pledged, goal, outcome)
head(Creative)
## # A tibble: 6 x 8
## Category launchedDay launchedmonth backers days pledged goal
## <fct> <dbl> <ord> <dbl> <dbl> <dbl> <dbl>
## 1 Film & Video 25 Apr 3 30 20 15000
## 2 Film & Video 16 Nov 23 30 414. 224.
## 3 Film & Video 28 Aug 28 30 1497 5000
## 4 Fashion 22 July 218 30 8795 6000
## 5 Film & Video 24 Aug 2 35 2 2000000
## 6 Art 25 Sept 5 37 230 2500
## outcome
## <fct>
## 1 failed
## 2 successful
## 3 failed
## 4 successful
## 5 failed
## 6 failed
#— Training & Validation Sets —#
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
set.seed(100)
Trainindex <- createDataPartition(y = Creative$outcome, p = .70, list = FALSE)
training <- Creative[Trainindex ,]
validation <- Creative[-Trainindex,]
training_new <- training[-8]
validation_new <- validation[-8]
training_label <- training$outcome
validation_label <- validation$outcome
3.1. Logistic regression
library(rpart)
## Warning: package 'rpart' was built under R version 3.6.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.6.3
library(caret)
set.seed(123)
default_glm_mod <- train(form = outcome ~ .,
data = training,
method = "glm",
family = "binomial",
tuneLength = 5)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_pred <- predict(default_glm_mod, newdata= validation)
confusionMatrix(glm_pred, validation$outcome, positive = "successful")
## Confusion Matrix and Statistics
##
## Reference
## Prediction failed successful
## failed 2633 2
## successful 2 1567
##
## Accuracy : 0.999
## 95% CI : (0.9976, 0.9997)
## No Information Rate : 0.6268
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.998
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9987
## Specificity : 0.9992
## Pos Pred Value : 0.9987
## Neg Pred Value : 0.9992
## Prevalence : 0.3732
## Detection Rate : 0.3727
## Detection Prevalence : 0.3732
## Balanced Accuracy : 0.9990
##
## 'Positive' Class : successful
##
summary(default_glm_mod)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3953 0.0000 0.0000 0.0000 0.4835
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.535e+01 2.488e+02 0.062 0.9508
## CategoryComics -1.265e+01 9.261e+01 -0.137 0.8914
## CategoryCrafts -6.878e+00 6.477e+00 -1.062 0.2883
## CategoryDance 2.407e+02 2.312e+05 0.001 0.9992
## CategoryDesign 2.526e+00 6.290e+01 0.040 0.9680
## CategoryFashion -5.842e+00 5.825e+00 -1.003 0.3159
## `CategoryFilm & Video` -2.403e+00 4.308e+00 -0.558 0.5770
## CategoryFood -8.931e+00 5.346e+00 -1.671 0.0948 .
## CategoryGames -6.475e+00 5.092e+00 -1.271 0.2036
## CategoryJournalism 4.772e+00 8.147e+01 0.059 0.9533
## CategoryMusic -3.628e+00 4.876e+00 -0.744 0.4568
## CategoryPhotography 7.384e+00 1.090e+02 0.068 0.9460
## CategoryPublishing -8.153e+00 3.955e+00 -2.061 0.0393 *
## CategoryTechnology -7.458e+00 1.168e+02 -0.064 0.9491
## CategoryTheater -2.255e+00 9.079e+00 -0.248 0.8038
## launchedDay 2.115e-02 1.285e-01 0.165 0.8693
## launchedmonth.L 4.234e+00 6.241e+02 0.007 0.9946
## launchedmonth.Q -7.135e+00 4.629e+02 -0.015 0.9877
## launchedmonth.C -1.194e+01 1.040e+03 -0.011 0.9908
## `launchedmonth^4` -4.614e+00 4.342e+02 -0.011 0.9915
## `launchedmonth^5` 9.839e+00 6.861e+02 0.014 0.9886
## `launchedmonth^6` 1.703e+01 1.114e+03 0.015 0.9878
## `launchedmonth^7` -9.812e-01 4.085e+02 -0.002 0.9981
## `launchedmonth^8` -5.329e+00 7.602e+02 -0.007 0.9944
## `launchedmonth^9` -2.387e+01 1.415e+03 -0.017 0.9865
## `launchedmonth^10` -1.047e+01 1.221e+03 -0.009 0.9932
## `launchedmonth^11` -8.940e+00 5.862e+02 -0.015 0.9878
## backers -1.378e-02 1.396e-01 -0.099 0.9213
## days -1.242e-01 1.065e-01 -1.166 0.2435
## pledged 9.599e-01 1.855e-01 5.176 2.27e-07 ***
## goal -9.570e-01 1.850e-01 -5.173 2.30e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12963.5705 on 9810 degrees of freedom
## Residual deviance: 1.9256 on 9780 degrees of freedom
## AIC: 63.926
##
## Number of Fisher Scoring iterations: 25
Only pledged and goal are important. bakers is not important
#— r part —#
set.seed(100)
rparttree <- rpart(outcome ~ pledged + goal, data = training,
method = "class",
cp = .001,
minsplit = 5,
xval = 5)
# Plot the trees
rpart.plot(rparttree)
plotcp(rparttree)
set.seed(123)
optimal_tree <- rpart(
formula = outcome ~ .,
data = training,
method = "class",
control = list(minsplit = 50,
minbucket= 100,
maxdepth = 4,
xval = 10,
cp = 0.0024))
pred <- predict(optimal_tree, newdata = validation,type = "class")
confusionMatrix(data = pred , validation$outcome, positive = "successful")
## Confusion Matrix and Statistics
##
## Reference
## Prediction failed successful
## failed 2472 41
## successful 163 1528
##
## Accuracy : 0.9515
## 95% CI : (0.9445, 0.9578)
## No Information Rate : 0.6268
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8979
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9739
## Specificity : 0.9381
## Pos Pred Value : 0.9036
## Neg Pred Value : 0.9837
## Prevalence : 0.3732
## Detection Rate : 0.3635
## Detection Prevalence : 0.4022
## Balanced Accuracy : 0.9560
##
## 'Positive' Class : successful
##
library(rattle)
## Warning: package 'rattle' was built under R version 3.6.3
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
fancyRpartPlot(optimal_tree,
palettes=c("Greys", "Blues"),
main = "Creative Kickstarter Classification Tree")
rpart.plot(optimal_tree, type = 4,
clip.right.labs = FALSE,
branch.lty = 3, # dotted branch lines
branch = 0.7,
under = TRUE,
nn = TRUE)