packages <- c('tidyverse', 'ggplot2', 'cowplot', 'DBI', 'lubridate', 'scales', 'randomForest')
sapply(packages, require, character.only = TRUE)
## tidyverse ggplot2 cowplot DBI lubridate scales
## TRUE TRUE TRUE TRUE TRUE TRUE
## randomForest
## TRUE
sql_file <- 'ks-data.sql'
options(scipen = 999)
# Connection object, SQLite driver, db in memory
con <- dbConnect(RSQLite::SQLite(), dbname = ':memory:')
getSQL <- function(sql_file){
sql_script <- ''
# Read .sql by lines and remove those that affects execution in SQLite3 driver
for (line in read_lines(sql_file, skip = 21, skip_empty_rows = TRUE)) {
if (sum(startsWith(x = line, prefix = c('/*', '--', ' KEY', 'LOCK', 'UNLOCK'))) == 0) {
sql_script <- c(sql_script, line)
}
}
# Remove the first element since it's blank, then joined all lines with line break
sql_script <- paste(sql_script[-1], collapse = '\n')
# Replace certain patterns with a ',,,' handle for splitting into CREATE and INSERT chunks
sql_script <- gsub(sql_script,
pattern = ' ENGINE=InnoDB DEFAULT CHARSET=latin1;',
replacement = ';,,,', fixed = TRUE)
sql_script <- gsub(sql_script,
pattern = 'CREATE TABLE',
replacement = ',,,CREATE TABLE', fixed = TRUE)
sql_script <- gsub(sql_script,
pattern = 'DROP TABLE',
replacement = ',,,DROP TABLE', fixed = TRUE)
# Changing \\ into \
sql_script <- gsub(sql_script,
pattern = '\\\'',
replacement = '', fixed = TRUE)
# Splitting into individual statement of DROP, CREATE, and INSERT
statements <- strsplit(sql_script, ',,,', fixed = TRUE)
return(statements[[1]][-1])
}
The reason to load the database here is to be able to use R markdown format of code blocks (both R and SQL queries), annotations, and plots to generate a clean report. Importing the .sql file here removes the requirement for MySQL server/workbench and allowing this script to be reproduced anywhere as long as the ks-data.sql file is present.
sql_script <- getSQL(sql_file)
for (statement in sql_script) {
dbExecute(conn = con, statement = statement)
}
SELECT * FROM campaign LIMIT 5
| id | name | sub_category_id | country_id | currency_id | launched | deadline | goal | pledged | backers | outcome |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Ragdolls | 23 | 2 | 2 | 2013-04-25 00:00:00 | 2013-05-25 00:00:00 | 15000.00 | 20.00 | 3 | failed |
| 2 | One To Ten | 47 | 1 | 1 | 2015-11-16 00:00:00 | 2015-12-16 00:00:00 | 223.68 | 413.81 | 23 | successful |
| 3 | Future Gun - A short film about kids and advanced hardware | 24 | 2 | 2 | 2013-08-28 00:00:00 | 2013-09-27 00:00:00 | 5000.00 | 1497.00 | 28 | failed |
| 4 | Fourth Wave Apparel—Vintage Fashion for the Modern Feminist | 52 | 2 | 2 | 2014-07-22 00:00:00 | 2014-08-21 00:00:00 | 6000.00 | 8795.00 | 218 | successful |
| 5 | The Rich Lifestyle | 9 | 2 | 2 | 2016-08-24 00:00:00 | 2016-09-28 00:00:00 | 2000000.00 | 2.00 | 2 | failed |
The executive team of a small board game company has approached you seeking assistance with setting up their Kickstarter campaign. The team has decided that they will need a minimum of $15,000 USD to get this project off the ground. However, they have ambitions of expanding the business and would like to maximize their funding. They must decide how much money to ask for and determine how many backers it will need to succeed.
SELECT cam.id, cam.name, cam.goal, cam.pledged, cam.outcome, cur.name as currency, coun.name as country
FROM campaign cam
JOIN currency cur ON cam.currency_id = cur.id
JOIN country coun ON cam.country_id = coun.id
/* Output to dataframe q1 */
Converting currency to USD and defining unsuccessful to include failed, canceled, and suspended:
# Some categorical variables can be factors
q1[c('outcome', 'currency', 'country')] <- lapply(q1[c('outcome', 'currency', 'country')], as.factor)
unique(q1$currency)
## [1] USD GBP EUR CAD SEK AUD HKD MXN NZD DKK NOK SGD CHF
## Levels: AUD CAD CHF DKK EUR GBP HKD MXN NOK NZD SEK SGD USD
# On 2021-05-12 10:05:37 PDT, conversion rates are as followed (provided by Morningstar)
exchange_rate <- data.frame(currency = unique(q1$currency),
exr = c(1, 1.41, 1.21, 0.83, 0.12, 0.77, 0.13, 0.05, 0.72, 0.16, 0.12, 0.75, 1.10))
# Adding new columns in q1 for rate and converted currency
q1 <- q1 %>% inner_join(exchange_rate, by = 'currency')
q1$goal_usd <- round(q1$goal * q1$exr, 2)
q1$pledged_usd <- round(q1$pledged * q1$exr, 2)
q1$pledged_to_goal <- round(q1$pledged_usd / q1$goal_usd, 2)
# Defining "unsuccessful"
unsuccessful <- c('failed', 'canceled', 'suspended')
q1$outcome2 <- ifelse(q1$outcome %in% unsuccessful, 'unsuccessful',
ifelse(q1$outcome == 'successful', 'successful', 'other'))
Mean differences between successful and other outcomes:
q1_outcome <-
q1 %>%
group_by(outcome) %>%
filter(outcome %in% c('successful', 'failed')) %>%
summarise(count = n(),
total_goal_usd = sum(goal_usd),
total_pledged_usd = sum(pledged_usd),
avg_goal_usd = mean(goal_usd),
avg_pledged_usd = mean(pledged_usd),
avg_pledged_to_goal = mean(pledged_to_goal)) %>%
add_row(
q1 %>%
group_by(outcome2) %>%
filter(outcome2 %in% c('unsuccessful')) %>%
summarise(count = n(),
total_goal_usd = sum(goal_usd),
total_pledged_usd = sum(pledged_usd),
avg_goal_usd = mean(goal_usd),
avg_pledged_usd = mean(pledged_usd),
avg_pledged_to_goal = mean(pledged_to_goal)) %>%
rename(outcome = outcome2)
)
pg_compare <-
q1_outcome %>%
pivot_longer(!outcome, names_to = 'categories') %>%
subset(categories %in% c('avg_pledged_usd', 'avg_goal_usd')) %>%
ggplot(aes(x = outcome, y = value)) +
geom_bar(aes(fill = categories), position = 'dodge', stat = 'identity') +
labs(title = 'Dollars (USD) raised by Outcomes',
x = 'Outcome', y = 'Amount (USD)') +
scale_y_continuous(breaks = pretty_breaks(n = 5),
labels = dollar) +
scale_fill_discrete(name = 'Fund types (average)', labels = c('Goal', 'Pledged')) +
theme(plot.title = element_text(hjust = 0.5),
panel.grid = element_blank(),
legend.position = 'top')
pg_ratio <-
q1_outcome %>%
pivot_longer(!outcome, names_to = 'categories') %>%
subset(categories %in% c('avg_pledged_to_goal')) %>%
ggplot(aes(x = outcome, y = value)) +
geom_bar(position = 'dodge', stat = 'identity') +
labs(title = 'Pledged:Goal Ratio',
x = 'Outcome', y = 'Pledged:Goal') +
theme(plot.title = element_text(hjust = 0.5),
panel.grid = element_blank())
plot_grid(pg_compare, pg_ratio, nrow = 1)
It would appear that in a comparison of pledged amount vs goal amount, the failed and the unsuccessful outcomes (including failed, canceled, and suspended) have fairly unrealistic goal amounts. Perhaps as a result, pledges to these unrealistic goals are very low.
Plotting Pledge:Goal Ratio shows that on average, pledges only made up to around 10% of the goal, whereas pledges for succesful projects could be more than double of the goal:
Central Tendencies:
# Quick function for mode
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
# Central tendencies, group by outcomes
q1 %>%
group_by(outcome) %>%
summarise(count = n(),
min = min(goal_usd),
mean = mean(goal_usd),
mode = Mode(goal_usd),
median = median(goal_usd),
max = max(goal_usd))
## # A tibble: 6 x 7
## outcome count min mean mode median max
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 canceled 875 0.15 161554. 5000 10000 107408764.
## 2 failed 4579 1 94182. 5000 7500 100000000
## 3 live 62 133. 28394. 20000 6250 300000
## 4 successful 3023 1 9628. 5000 4000 1000000
## 5 suspended 48 1 2110451. 50000 3250 100000000
## 6 undefined 79 866. 4433. 5000 4500 11058.
There are some fairly skewed distributions, especially for the outcomes under consideration.
Assume we are looking at categories as a whole, not as projects in the categories that had the most backers:
SELECT cam.id, cam.name, cam.launched, cam.deadline, cam.backers, cam.pledged, s_cat.name as sub_cat, cat.name as cat
FROM campaign cam
JOIN sub_category s_cat ON cam.sub_category_id = s_cat.id
JOIN category cat ON cat.id = s_cat.category_id
/* Output to dataframe q2 */
# Some variables to date and factor objects
q2[c('launched', 'deadline')] <- lapply(q2[c('launched', 'deadline')], as_date)
q2[c('sub_cat', 'cat')] <- lapply(q2[c('sub_cat', 'cat')], as.factor)
# Group by provided group variable name in characters, sum by provided metric variable name in characters
cat_sum <- function(df, group, metric) {
temp <- df %>%
group_by(across(all_of(group))) %>%
summarise(total_metric = sum(across(all_of(metric)))) %>%
arrange(total_metric)
return(temp)
}
# Return duration in days, given df and category name
avg_dur <- function(df, categories) {
durs <- 0
for (category in categories) {
temp <- df[df$cat == category, ] %>%
summarise(avg_len = round(mean(interval(launched, deadline) / days(1)), 2))
durs <- c(durs, temp$avg_len)
}
return(durs[-1])
}
top_backers <- cat_sum(q2, 'cat', 'backers') %>% tail(3)
top_backers <- top_backers[order(top_backers$total_metric, decreasing = TRUE), ]$cat
bottom_backers <- cat_sum(q2, 'cat', 'backers') %>% head(3)
The top three categories with the most backers are Games, Technology, Design. The average length of their campaigns are 32.27, 35.48, 35.08 days, respectively.
The bottom three categories are Dance, Journalism, Crafts. The average length of their campaigns are 31.66, 36.31, 31.74 days, respectively.
Assuming the question meant money pledged, regardless of success or not:
In SQL it should look like this for top 3:
SELECT cat.name as category, SUM(cam.pledged) as total_money_raised
FROM campaign cam
JOIN sub_category s_cat ON cam.sub_category_id = s_cat.id
JOIN category cat ON cat.id = s_cat.category_id
GROUP BY cat.name
ORDER BY SUM(cam.pledged) DESC
LIMIT 3
| category | total_money_raised |
|---|---|
| Games | 16787321 |
| Technology | 15682750 |
| Design | 12722717 |
For bottom 3:
SELECT cat.name as category, SUM(cam.pledged) as total_money_raised
FROM campaign cam
JOIN sub_category s_cat ON cam.sub_category_id = s_cat.id
JOIN category cat ON cat.id = s_cat.category_id
GROUP BY cat.name
ORDER BY SUM(cam.pledged) ASC
LIMIT 3
| category | total_money_raised |
|---|---|
| Journalism | 228386.9 |
| Crafts | 308738.5 |
| Dance | 309326.3 |
But since the function is written already for answering Question 2, this can be done on the q2 dataset:
top_pledges <- cat_sum(q2, 'cat', 'pledged') %>% tail(3)
top_pledges <- top_pledges[order(top_pledges$total_metric, decreasing = TRUE), ]$cat
bottom_pledges <- cat_sum(q2, 'cat', 'pledged') %>% head(3)
The top three categories with the most pledges are Games, Technology, Design.
The bottom three categories are Journalism, Crafts, Dance. The groups are the same when compared to number of backers, just a little different in the order. This makes sense as number of backers directly impact pledges.
First we find the subcategory name that represent board games:
SELECT *
FROM sub_category
WHERE name LIKE '%Game%'
| id | name | category_id |
|---|---|---|
| 13 | Games | 7 |
| 14 | Tabletop Games | 7 |
| 44 | Video Games | 7 |
| 66 | Mobile Games | 7 |
| 122 | Live Games | 7 |
SQL Query:
SELECT cam.name as project, cam.backers as backers, cat.name as category, cam.goal as goal, cam.pledged as money_raised, s_cat.name as subcategory
FROM campaign cam
JOIN sub_category s_cat ON cam.sub_category_id = s_cat.id
JOIN category cat ON cat.id = s_cat.category_id
WHERE s_cat.name = 'Tabletop Games'
AND cam.outcome = 'successful'
ORDER BY money_raised DESC
LIMIT 5
/* Output to dataframe q4 */
The most successful board game (subcategory: Tabletop Games) is Ghostbustersâ„¢: The Board Game, clocking at $1,546,269.50, which is about 6 times more than the goal $250,000.00. This project had 8396 backers.
SELECT cam.name as campaign, coun.name as country, cam.outcome as outcome, SUM(cam.pledged) as money_raised
FROM campaign cam
JOIN country coun ON cam.country_id = coun.id
WHERE cam.outcome = 'successful'
GROUP BY coun.name
ORDER BY SUM(cam.pledged) DESC
LIMIT 10
/* Output to dataframe q5 */
Top 3 countries with the most successful campaigns in terms of dollars are US, GB, CA, with campaigns Fourth Wave Apparel—Vintage Fashion for the Modern Feminist, One To Ten, Ryan Snooks Thesis Film \"META\", which raised $53,799,790.46, $4,299,118.10, $1,181,331.28 respectively.
SELECT coun.name as country, cam.outcome as outcome, COUNT(cam.name) as total_campaigns
FROM campaign cam
JOIN country coun ON cam.country_id = coun.id
WHERE cam.outcome = 'successful'
GROUP BY coun.name
ORDER BY COUNT(cam.name) DESC
LIMIT 10
Top 3 countries with the most successful campaigns in terms of the number of campaigns backed are US, GB, CA, which backed 2489, 275, 78 campaigns, respectively.
SQL Query to retrieve a dataframe that contains currency names:
SELECT cam.id, cam.name, cam.launched, cam.deadline, cam.goal, cam.pledged, cam.outcome, cam.backers, cur.name as currency
FROM campaign cam
JOIN currency cur ON cam.currency_id = cur.id
/* Output to dataframe q6 */
Prepare the dataframe to include campaign duration and pledges in USD:
q6_outcome <-
q6 %>%
inner_join(exchange_rate, by = 'currency') %>%
mutate(pledged_usd = pledged * exr) %>%
mutate(dur = round(interval(launched, deadline) / days(1))) %>%
select(id, name, outcome, backers, pledged_usd, dur)
q6_all_outcomes <-
q6_outcome %>%
ggplot(aes(x = dur, y = pledged_usd)) +
geom_point(alpha = 0.5) +
facet_grid(~ outcome) +
labs(title = 'Total Pledges vs Campaign Duration',
subtitle = 'All Outcomes',
x = 'Duration (Days)',
y = 'Dollars (USD)') +
scale_y_continuous(breaks = pretty_breaks(n = 5),
labels = dollar) +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
panel.grid = element_blank(),
panel.grid.major.y = element_line(color = 'white'),
strip.text.x = element_text(size = 4),
axis.text.x = element_text(size = 4))
q6_only_successful <-
q6_outcome %>%
filter(outcome == 'successful') %>%
ggplot(aes(x = dur, y = pledged_usd)) +
geom_point(alpha = 0.5) +
facet_grid(~ outcome) +
scale_y_continuous(breaks = pretty_breaks(n = 5),
labels = dollar) +
labs(title = 'Total Pledges vs Campaign Duration',
subtitle = 'Outcome: successful',
x = 'Duration (Days)',
y = '') +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
panel.grid = element_blank(),
panel.grid.major.y = element_line(color = 'white'),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())
plot_grid(q6_all_outcomes, q6_only_successful, nrow = 1)
In a quick scatterplot to view pledges and campaign durations, there seems to be a more pronounced pattern, if any, in the successful outcome. Focusing on that particular outcome does show that higher total pledges tend to aggregate in the duration range of 25 to 65 days. But there is no linear relationship nor clear trend present. This may be category specific.
q6_outcome %>%
# filter(outcome == 'successful') %>%
ggplot() +
geom_histogram(aes(x = dur)) +
facet_wrap(~ outcome) +
labs(title = 'Duration Distribution',
subtitle = 'All Outcomes',
x = 'Duration (Days)',
y = 'Count') +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
panel.grid = element_blank())
Additionally, distribution of the durations showed that there is a much stronger preference for around 30 days, contributing to the variance in total pledges. Therefore, the campaign duration would not make a good predictor for total money raised. Perhaps a better choice would be category or sub_category, or even number of backers.
SQL Query to retrieve all usable variables:
SELECT cam.name as project, cam.launched, cam.deadline, cam.goal, cam.pledged, cam.outcome, cam.backers,
cur.name as currency, coun.name as country, cat.name as cat, s_cat.name as sub_cat
FROM campaign cam
JOIN currency cur ON cam.currency_id = cur.id
JOIN country coun ON cam.country_id = coun.id
JOIN sub_category s_cat ON cam.sub_category_id = s_cat.id
JOIN category cat ON cat.id = s_cat.category_id
Clean up the dataframe:
complete <-
vis %>%
inner_join(exchange_rate, by = 'currency') %>% # Exchange rates respective to country
mutate(pledged_usd = pledged * exr, goal_usd = goal * exr) %>% # Converting currency to USD
mutate(duration = round(interval(launched, deadline) / days(1))) %>% # Converting duration in days
select(project, outcome, launched, backers, pledged_usd, goal_usd, duration, cat, sub_cat, country) %>%
# Transforming data types
transform(outcome = as.factor(outcome),
cat = as.factor(cat),
country = as.factor(country),
launched = as_date(launched))
Visualizing Kickstarter popularity:
complete %>%
group_by(outcome, month = cut.Date(launched, breaks = 'month') %>% as_date()) %>%
filter(outcome %in% c('successful', 'failed')) %>%
summarise(count = n()) %>%
arrange(month) %>%
# Line
ggplot(aes(month, count)) +
geom_line(aes(group = outcome, color = outcome), size = 1, alpha = 0.8) +
labs(title = 'Monthly Campaign Count, 2009-2017',
x = 'Date',
y = 'Count',
color = 'Outcome') +
scale_x_date(date_labels = '%Y-%m', date_breaks = '24 weeks') +
scale_y_continuous(breaks = pretty_breaks(n = 10)) +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, vjust = 0.3, hjust = 0),
axis.ticks = element_blank(),
panel.grid = element_blank(),
legend.position = 'top')
When Monthly Campaign Count is plotted along Date since founding Kickstarter in 2009, it can be noted a steady growing trend. There are a few irregular spikes in failed campaigns near June 2014 and another near April 2015.
To identify these spikes, we can facet this plot further down by categories:
complete %>%
group_by(outcome, cat, month = cut.Date(launched, breaks = 'month') %>% as_date()) %>%
filter(outcome %in% c('successful', 'failed')) %>%
summarise(count = n()) %>%
arrange(month) %>%
# Line
ggplot(aes(month, count)) +
geom_line(aes(group = outcome, color = outcome), size = 1, alpha = 0.8) +
geom_vline(xintercept = c(as_date('2014-06-01'), as_date('2015-04-01')), color = 'white', alpha = 0.8) +
facet_grid(rows = vars(cat), scales = 'free_x') +
labs(title = 'Monthly Campaign Count, 2009-2017',
subtitle = 'By Category',
x = 'Date',
y = 'Count',
color = 'Outcome') +
scale_x_date(date_labels = '%Y-%m', date_breaks = '24 weeks') +
scale_y_continuous(breaks = pretty_breaks(n = 10)) +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, vjust = 0.3, hjust = 0),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
legend.position = 'top',
strip.text.y = element_text(angle = 0))
It appears that Film & Video, Food, Music, Publishing and Technology are the few categories that stand out in terms of contributing to the spikes.
We can find the specific months during those time periods that had the highest failed campaigns:
complete %>%
group_by(month = cut.Date(launched, breaks = 'month') %>% as_date()) %>%
filter(outcome %in% c('failed')) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>% head(5)
## # A tibble: 5 x 2
## month count
## <date> <int>
## 1 2014-07-01 156
## 2 2015-03-01 124
## 3 2015-02-01 119
## 4 2015-04-01 112
## 5 2014-08-01 103
And use that finding to narrow down those categories:
hi_fail_cats <-
complete %>%
filter(outcome == 'failed',
launched > as_date('2014-07-01') & launched < as_date('2014-09-01') |
launched > as_date('2015-02-01') & launched < as_date('2015-05-01')) %>%
group_by(cat) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>% head(5) %>% .$cat %>% factor()
Which confirmed what was discovered in the facet plots – categories exhibited sudden increase in failed outcomes: Food, Film & Video, Technology, Publishing, Music.
Examining goal and pledged patterns of these categories compared to the rest:
hfc_time <-
complete %>%
filter(outcome == 'failed') %>%
mutate(pledged_to_goal = round(pledged_usd / goal_usd, 2),
hi_fail = cat %in% hi_fail_cats,
month = cut.Date(launched, breaks = 'month') %>% as_date()) %>%
ggplot() +
labs(x = 'Date') +
scale_x_date(date_labels = '%Y-%m', date_breaks = '24 weeks') +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, vjust = 0.3, hjust = 0),
axis.ticks = element_blank(),
panel.grid = element_blank(),
legend.position = 'top')
hfc_time +
geom_point(aes(x = month, y = (goal_usd), color = hi_fail), alpha = 0.2) +
facet_wrap(~ hi_fail, labeller = labeller(hi_fail = c('FALSE' = paste(hi_fail_cats, collapse = ', '),
'TRUE' = 'Others'))) +
geom_smooth(aes(x = month, y = (goal_usd))) +
labs(title = 'Goal in USD',
y = 'Dollars',
color = 'Categories') +
# scale_color_discrete(labels = c('High Fail', 'Others')) +
scale_y_continuous(labels = dollar) +
scale_x_date(date_labels = '%Y-%m', date_breaks = '48 weeks') +
theme(plot.title = element_text(hjust = 0.5),
legend.position = 'none')
It’s difficult to visualize in dollars due to the wide range and a non-parametric distribution. For relative comparison we could log-transform the dollar data:
hfc_log_goal <-
hfc_time +
geom_point(aes(x = month, y = log(goal_usd, 10), color = hi_fail), alpha = 0.2) +
facet_wrap(~ hi_fail, labeller = labeller(hi_fail = c('FALSE' = paste(hi_fail_cats, collapse = ', '),
'TRUE' = 'Others'))) +
geom_smooth(aes(x = month, y = log(goal_usd, 10))) +
labs(title = 'Log-Transformed Goal in USD',
y = 'log(Dollars)',
color = 'Categories') +
# scale_color_discrete(labels = c('High Fail', 'Others')) +
scale_x_date(date_labels = '%Y-%m', date_breaks = '48 weeks') +
theme(plot.title = element_text(hjust = 0.5),
legend.position = 'none',
axis.text.x = element_blank(),
axis.title.x = element_blank(),
panel.grid.major.x = element_line(color = 'white'))
hfc_log_pledges <-
hfc_time +
geom_point(aes(x = month, y = log(pledged_usd, 10), color = hi_fail), alpha = 0.2) +
facet_wrap(~ hi_fail, labeller = labeller(hi_fail = c('FALSE' = paste(hi_fail_cats, collapse = ', '),
'TRUE' = 'Others'))) +
geom_smooth(aes(x = month, y = log(pledged_usd, 10))) +
labs(title = 'Log-Transformed Pledges in USD',
y = 'log(Dollars)',
color = 'Categories') +
# scale_color_discrete(labels = c('High Fail', 'Others')) +
scale_x_date(date_labels = '%Y-%m', date_breaks = '48 weeks') +
theme(plot.title = element_text(hjust = 0.5),
legend.position = 'none',
panel.grid.major.x = element_line(color = 'white'))
plot_grid(hfc_log_goal, hfc_log_pledges, nrow = 2)
There is no clear difference in trends for these categories exhibiting spikes in failed outcomes in terms of goal or pledged dollars.
Examining the ratio of pledged to goal to see if a trend can be captured here:
hfc_pg_ratio <-
hfc_time +
geom_point(aes(x = month, y = pledged_to_goal, color = hi_fail), alpha = 0.2) +
facet_wrap(~ hi_fail, labeller = labeller(hi_fail = c('FALSE' = paste(hi_fail_cats, collapse = ', '),
'TRUE' = 'Others'))) +
geom_smooth(aes(x = month, y = pledged_to_goal)) +
labs(title = 'Pledge:Goal Ratio',
y = 'Pledge:Goal',
color = 'Categories') +
scale_color_discrete(labels = c(paste(hi_fail_cats, collapse = ', '), 'Others')) +
scale_x_date(date_labels = '%Y-%m', date_breaks = '48 weeks') +
theme(plot.title = element_text(hjust = 0.5),
legend.position = 'none')
plot_grid(hfc_pg_ratio)
The categories responsible for the sudden failed spikes showed a pretty consistent trend in this ratio, whereas the rest of the categories showed a slight dip when the spikes took place.
Next, we can compare successful and failed outcomes by category:
complete %>%
group_by(cat, outcome) %>%
summarise(count = n()) %>%
filter(outcome %in% c('successful', 'failed')) %>%
# Bar
ggplot(aes(x = cat)) +
geom_bar(aes(y = count, fill = outcome), stat = 'identity', position = 'dodge')+
labs(title = 'Campaign count by category, 2009-2017',
x = 'Category',
y = 'Count',
fill = 'Outcome') +
scale_y_continuous(breaks = pretty_breaks(n = 5)) +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, vjust = 0.3, hjust = 1),
axis.ticks = element_blank(),
panel.grid = element_blank(),
legend.position = 'top')
There are quite a few categories where there are more failed outcomes than successful (i.e. fail rate). We can sort by fail rate to see which categories are more likely to fail:
cat_reorder <-
complete %>%
group_by(cat, outcome) %>%
summarise(count = n()) %>%
filter(outcome %in% c('successful', 'failed')) %>%
pivot_wider(names_from = outcome, values_from = count) %>%
mutate(fail_success = failed / successful) %>%
arrange(desc(fail_success)) %>%
pull(cat) %>% as.vector()
complete %>%
group_by(cat, outcome) %>%
summarise(count = n()) %>%
filter(outcome %in% c('successful', 'failed')) %>%
transform(cat = factor(cat, levels = cat_reorder)) %>%
# Bar
ggplot(aes(x = cat)) +
geom_bar(aes(y = count, fill = outcome), stat = 'identity', position = 'dodge')+
labs(title = 'Campaign count by category, 2009-2017, sort by "fail rate"',
x = 'Category',
y = 'Count',
fill = 'Outcome') +
scale_y_continuous(breaks = pretty_breaks(n = 5)) +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, vjust = 0.3, hjust = 1),
axis.ticks = element_blank(),
panel.grid = element_blank(),
legend.position = 'top')
We can look at the pattern of goal and fulfilling pledges in failed and successful outcomes:
complete %>%
group_by(cat, outcome) %>%
summarise(avg_goal = mean(goal_usd),
avg_pledge = mean(pledged_usd)) %>%
filter(outcome %in% c('successful', 'failed')) %>%
pivot_longer(cols = c(avg_goal, avg_pledge),
names_to = 'fund_type', values_to = 'usd') %>%
# Bar stacked
ggplot(aes(x = cat)) +
geom_bar(aes(y = usd, fill = fund_type), stat = 'identity', position = 'stack') +
facet_grid(~ outcome) +
labs(title = 'Average Funding (USD), 2009-2017',
x = 'Category',
y = 'Dollars (USD)',
fill = 'Funding (Avg)') +
scale_fill_discrete(labels = c('Goal', 'Pledge')) +
scale_y_continuous(breaks = pretty_breaks(n = 5),
labels = dollar) +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, vjust = 0.3, hjust = 1),
axis.ticks = element_blank(),
panel.grid = element_blank(),
legend.position = 'top')
The above stacked barplot is a variant of the one used to answer Part 1 Question 1. This one is broken down by categories with funding goal and pledge stacked. The failed campaigns typically have much higher goals set, and pledges filling up only fractions. Successful campaigns, on the other hand, seem to have more realistic goals, and are rewarded with pledges as high as 4 times the goals.
complete %>%
group_by(outcome, cat) %>%
summarise(avg = mean(duration),
median = median(duration),
mode = Mode(duration)) %>%
filter(outcome %in% c('successful', 'failed'))
## `summarise()` has grouped output by 'outcome'. You can override using the `.groups` argument.
## # A tibble: 30 x 5
## # Groups: outcome [2]
## outcome cat avg median mode
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 failed Art 33.5 30 30
## 2 failed Comics 35.8 30 30
## 3 failed Crafts 33.1 30 30
## 4 failed Dance 29.6 30 30
## 5 failed Design 35.0 30 30
## 6 failed Fashion 33.5 30 30
## 7 failed Film & Video 36.3 30 30
## 8 failed Food 34.6 30 30
## 9 failed Games 32.5 30 30
## 10 failed Journalism 39.8 32 30
## # ... with 20 more rows
Central tendencies for the duration seem quite aligned. We can utilize boxplot to visualize the spread.
complete %>%
group_by(cat, outcome) %>%
filter(outcome %in% c('successful', 'failed')) %>%
# Boxplot
ggplot(aes(x = cat)) +
geom_boxplot(aes(y = duration, fill = outcome, middle = mean(duration)),
outlier.alpha = 0.3, outlier.stroke = 0) +
labs(title = 'Average Duration, 2009-2017',
x = 'Category',
y = 'Days',
fill = 'Outcome') +
scale_y_continuous(breaks = pretty_breaks(n = 5)) +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, vjust = 0.3, hjust = 1),
axis.ticks = element_blank(),
panel.grid = element_blank(),
legend.position = 'top')
As expected, duration has a fairly consistent median across categories.
Backer count, on the other hand, varies widely: as low as 0 and as high as 105857, with a low mean 110.6396261. To properly visualize this, we can utilize log transformation again:
complete %>%
group_by(cat, outcome) %>%
filter(outcome %in% c('successful', 'failed')) %>%
# Boxplot
ggplot(aes(x = cat)) +
geom_boxplot(aes(y = log(backers, 10), fill = outcome, middle = mean(log(backers, 10))),
outlier.alpha = 0.3, outlier.stroke = 0) +
labs(title = 'Log-Transformed Average Number of Backers, 2009-2017',
x = 'Category',
y = 'Log(Backer Count)',
fill = 'Outcome') +
scale_y_continuous(breaks = pretty_breaks(n = 5)) +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, vjust = 0.3, hjust = 1),
axis.ticks = element_blank(),
panel.grid = element_blank(),
legend.position = 'top')
It’s clear (and expected) the failed projects had significantly lower backers, on the scale of hundreds.
To simplify Random Forest modeling, I kept only successful and failed outcomes, and kept backers, goal in USD, category, and country as predictors for the training:
complete_rf <-
complete %>%
filter(outcome %in% c('successful', 'failed')) %>%
select(outcome, backers, goal_usd, cat, country) %>%
transform(outcome = factor(outcome))
set.seed(210513)
rf_model <- randomForest(data = complete_rf, outcome ~ ., proximity = TRUE)
Before any fine tuning, this Random Forest model provides fairly good classifications of failed or successful:
The out-of-bag (OOB) error rate is 7.34% which means 92.66% of the OOB samples were correctly classified.
The confusion matrix indicates very good predictions when classifying our input data:
## failed successful class.error
## failed 4263 316 0.06901070
## successful 242 2781 0.08005293
We can plot the OOB error rates for the 500 trees:
oob_error_df <- data.frame(
trees = rep(1:nrow(rf_model$err.rate), times = 3),
type = rep(c('OOB', 'failed', 'successful'), each = nrow(rf_model$err.rate)),
error = c(rf_model$err.rate[, 'OOB'],
rf_model$err.rate[, 'failed'],
rf_model$err.rate[, 'successful'])
)
ggplot(data = oob_error_df, aes(x = trees, y = error)) +
geom_line(aes(color = type)) +
labs(title = 'Random Forest Model Error Rate Progression',
y = 'Error Rate',
x = 'Number of Tree',
color = 'Error for:') +
theme(plot.title = element_text(hjust = 0.5))
This model can still be useful at this point without any fine tuning, and we can use it to predict a test dataset made up of the rest of the outcomes (canceled, suspended, live, and undefined):
test_set <-
complete %>%
filter(!outcome %in% c('successful', 'failed')) %>%
select(outcome, backers, goal_usd, pledged_usd, cat, sub_cat, country) %>%
transform(outcome = factor(outcome))
test_predictions <-
test_set %>%
mutate(prediction = predict(rf_model, newdata = test_set),
pledged_to_goal = pledged_usd / goal_usd) %>%
select(outcome, prediction, backers, pledged_to_goal, goal_usd, pledged_usd, cat, sub_cat, country)
Predicting on live outcome:
live_pred <-
test_predictions %>%
filter(outcome == 'live') %>%
arrange(desc(prediction)) %>%
ggplot(aes(x = cat, color = prediction)) +
facet_wrap(~ prediction) +
labs(x = 'Category') +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, vjust = 0.3, hjust = 1),
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
legend.position = 'none',
panel.grid = element_blank(),
panel.grid.major = element_line(color = 'white'))
live_backers <-
live_pred +
geom_point(aes(y = backers)) +
labs(y = 'Backer Count',
title = 'RF Prediction on "live" Campaigns') +
theme(axis.text.x = element_blank())
live_pledged <-
live_pred +
geom_point(aes(y = pledged_usd)) +
labs(y = 'Pledged (USD)') +
scale_y_continuous(breaks = pretty_breaks(n = 5),
labels = dollar) +
theme(strip.background.x = element_blank(),
strip.text.x = element_blank(),
axis.text.x = element_blank())
live_goal <-
live_pred +
geom_point(aes(y = goal_usd)) +
labs(y = 'Goal (USD)') +
scale_y_continuous(breaks = pretty_breaks(n = 5),
labels = dollar) +
theme(strip.background.x = element_blank(),
strip.text.x = element_blank(),
axis.text.x = element_blank())
live_ratio <-
live_pred +
geom_point(aes(y = pledged_to_goal)) +
labs(y = 'Pledged:Goal') +
theme(strip.background.x = element_blank(),
strip.text.x = element_blank())
plot_grid(live_backers, live_pledged, live_goal, live_ratio, nrow = 4, align = 'v', rel_heights = c(1, 1, 1, 1))
The Random Forest model predicts the outcome for the currently live campaigns.
Those predicted to be successful tend to have a much lower goal, supported by more backers and received more in pledges.
# Dataset for board games
bg_df <-
complete %>%
filter(sub_cat == 'Tabletop Games',
outcome %in% c('successful', 'failed')) %>%
select(project, outcome, backers, duration, pledged_usd, goal_usd, country)
bg_goal_summary <-
bg_df %>%
group_by(outcome) %>%
summarise(avg_goal = dollar(mean(goal_usd)),
med_goal = dollar(median(goal_usd)),
count = n())
bg_df %>%
ggplot(aes(x = outcome, fill = outcome)) +
geom_boxplot(aes(y = goal_usd), outlier.alpha = 0.3, outlier.stroke = 0) +
coord_cartesian(ylim = c(0, 50000)) +
stat_summary(geom = 'text', fun = 'median',
aes(y = goal_usd, label = dollar(..y..)),
position = position_nudge(y = -1200),
size = 3.5) +
geom_hline(yintercept = 15000, color = 'red', linetype = 'dashed') +
labs(x = 'Outcome',
y = 'Dollar',
title = 'Summary Goal in USD for Tabletop Games') +
theme(plot.title = element_text(hjust = 0.5),
legend.position = 'none',
panel.grid = element_blank())
Taking 179 successful outcomes for Tabletop Games into consideration, $15,000 (red dotted line) is well beyond the median $5,000 and the mean $10,816.71. A recommended realistic goal should be around $10,000.
# Building a new test set with varying backers
names <- c('backers', 'goal_usd', 'cat', 'country')
backers <- seq(100, 300)
bg_backers <-
data.frame(
backers,
rep(15000, times = length(backers)),
rep('Games', times = length(backers)),
rep('US', times = length(backers)))
names(bg_backers) <- names
# Factor levels need to be identical to the training set else RF prediction throws error
bg_backers %>%
transform(backers = as.integer(backers),
cat = factor(cat, levels = complete_rf$cat %>% levels()),
country = factor(country, levels = complete_rf$country %>% levels())) -> bg_backers
bg_backers %>%
mutate(prediction = predict(rf_model, newdata = bg_backers)) %>%
filter(prediction == 'successful') %>%
arrange(backers) %>% head(5)
## backers goal_usd cat country prediction
## 1 185 15000 Games US successful
## 2 186 15000 Games US successful
## 3 187 15000 Games US successful
## 4 188 15000 Games US successful
## 5 189 15000 Games US successful
Using the following predictors: goal $15,000, category Games, country US, the Random Forest model predicts a minimum of 185 backers to meet that goal.
zoom_out <-
complete %>%
filter(cat == 'Games',
outcome %in% c('successful', 'failed')) %>%
ggplot(aes(x = backers, y = goal_usd)) +
geom_point(aes(color = outcome)) +
geom_smooth() +
geom_hline(yintercept = 15000, color = 'red', alpha = 0.5, linetype = 'dashed') +
facet_wrap(~ outcome) +
scale_y_continuous(breaks = pretty_breaks(n = 5),
labels = dollar) +
labs(y = 'Dollars (USD)',
title = 'Goals vs Backers') +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
legend.position = 'none',
panel.grid = element_blank(),
panel.grid.major.x = element_line(color = 'white'))
zoom_in <-
zoom_out +
coord_cartesian(ylim = c(14800, 15200)) +
labs(title = 'Zoomed in around Goal $15,000',
x = 'Backers') +
theme(axis.text.x = element_text(),
axis.title.x = element_text())
plot_grid(zoom_out, zoom_in, nrow = 2, align = 'v')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
avg_backers <-
complete %>%
filter(cat == 'Games',
outcome %in% c('successful', 'failed'),
goal_usd > 14800,
goal_usd < 15200) %>%
select(outcome, backers) %>%
group_by(outcome) %>%
summarise(avg_backers = mean(backers))
Under LOESS smoothing, for a goal of $15,000 in the Games category, the client can expect anywhere from 31 to 1115 backers.
con objectdbDisconnect(con)