First we load the dataset by reading it in from the csv file and convert date columns into usable date formats.
setwd("/Users/elinaazrilyan/Documents/Data698/")
df <- read.csv("ks-projects-201801.csv", stringsAsFactors = F)
df$launched <- as.Date(df$launched)
df$deadline <- as.Date(df$deadline)
head(df)
names(df)
## [1] "ID" "name" "category" "main_category"
## [5] "currency" "deadline" "goal" "launched"
## [9] "pledged" "state" "backers" "country"
## [13] "usd.pledged" "usd_pledged_real" "usd_goal_real"
Taking a look at a summary of the data, there seem to be missing values in the usd.pledged
column. We will remove those rows since it is only around 1% of the data.
df <- df %>% filter((!(is.na(.$usd.pledged))))
summary(df)
## ID name category main_category
## Min. :5.971e+03 Length:374864 Length:374864 Length:374864
## 1st Qu.:5.381e+08 Class :character Class :character Class :character
## Median :1.075e+09 Mode :character Mode :character Mode :character
## Mean :1.075e+09
## 3rd Qu.:1.610e+09
## Max. :2.147e+09
## currency deadline goal
## Length:374864 Min. :2009-05-03 Min. : 0
## Class :character 1st Qu.:2013-06-01 1st Qu.: 2000
## Mode :character Median :2015-01-05 Median : 5500
## Mean :2014-10-30 Mean : 49523
## 3rd Qu.:2016-05-01 3rd Qu.: 16500
## Max. :2018-03-03 Max. :100000000
## launched pledged state backers
## Min. :1970-01-01 Min. : 0 Length:374864 Min. : 0.0
## 1st Qu.:2013-04-30 1st Qu.: 31 Class :character 1st Qu.: 2.0
## Median :2014-12-02 Median : 620 Mode :character Median : 12.0
## Mean :2014-09-25 Mean : 9750 Mean : 106.7
## 3rd Qu.:2016-03-29 3rd Qu.: 4080 3rd Qu.: 57.0
## Max. :2018-01-02 Max. :20338986 Max. :219382.0
## country usd.pledged usd_pledged_real usd_goal_real
## Length:374864 Min. : 0 Min. : 0 Min. : 0
## Class :character 1st Qu.: 17 1st Qu.: 31 1st Qu.: 2000
## Mode :character Median : 395 Median : 624 Median : 5500
## Mean : 7037 Mean : 9121 Mean : 45863
## 3rd Qu.: 3034 3rd Qu.: 4051 3rd Qu.: 16000
## Max. :20338986 Max. :20338986 Max. :166361391
Let’s view the distribution of project statuses.
ggplot(df, aes(state)) +
geom_bar() +
ylab("Number of Projects") + xlab("Final State") +
ggtitle("State Distribution of the Kickstarter projects")
Even though there is a significant amount of canceled projects (close to 40K), we will focus on Failed and Successful status only since our goal is determining success predictors and it is not clear what the reason for canceling a project might be - so we can’t clasify those as failed projects.
ksdf <- df %>% filter(state == "failed" | state == "successful")
ksdf$state <- as.character(ksdf$state)
ksdf$state <- as.factor(ksdf$state)
ggplot(ksdf, aes(state)) +
geom_bar() +
ylab("Number of Projects") + xlab("Final State") +
ggtitle("State Distribution of the Kickstarter projects")
I will calculate the duration for each project by subtracting deadline from launched date and round it to the number of days.
ksdf$duration = round(difftime(ksdf$deadline,ksdf$launched, units = "days"),0 )
ksdf$duration <- as.numeric(ksdf$duration)
Let’s examing the distibution of projects by country.
ggplot(ksdf, aes(country)) +
geom_bar() +
ylab("Number of Projects") + xlab("Country") +
ggtitle("Country Distribution of the Kickstarter projects")
I will reduce the number of countries by combining all countries except major ones into “Other” bucket.
ksdf$country <- as.character(ksdf$country)
ksdf$country[ksdf$country %in% c("AT", "BE", "CH", "DE", "DK", "ES", "FR", "HK", "IE", "IT", "JP", "LU", "MX", "NL", "NO", "NZ", "SE", "SG")] <- "Other"
ksdf$country <- as.factor(ksdf$country)
ggplot(ksdf, aes(country)) +
geom_bar() +
ylab("Number of Projects") + xlab("Country") +
ggtitle("Country Distribution of the Kickstarter projects")
n2 = data.frame(table(ksdf$main_category, ksdf$state))
p=ggplot(data=n2,aes(x=Var1,y=Freq,fill=Var2))
p + geom_bar(stat="identity", position=position_dodge())+
labs(title="State Of Projects by Category", x = "Categories", y="Number of Projects", fill="State of Projects") +
theme(axis.text.x = element_text(angle = 45, size = 8, hjust = 1))
There are several categories that appear to have higher success vs failure rate based on the above plot. Those categories are: Comics, Dance, Music, and Theater. Also, the categories with the highest failure vs success ratio are Technology, Publishing, Photography, Fashion, and Crafts.
For the categories with high ratio of success, word clouds were created to identify if there is any specific words in the title that can predict success for those categories.
set.seed(1111)
ksdfs <- ksdf %>% filter(state == "successful" & main_category == "Theater")
corpus <- Corpus(VectorSource(ksdfs$name))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, function(x) removeWords(x, stopwords()))
tdm <-TermDocumentMatrix(corpus, control=list(wordLengths=c(1,Inf)))
freqs <- slam::row_sums(tdm)
wordss <- names(freqs)
wordcloud(wordss, freqs, min.freq=5, max.words = 150, colors = brewer.pal(8, "Dark2"))
set.seed(1111)
ksdff <- ksdf %>% filter(state == "failed" & main_category == "Theater")
corpus <- Corpus(VectorSource(ksdff$name))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, function(x) removeWords(x, stopwords()))
tdm <-TermDocumentMatrix(corpus, control=list(wordLengths=c(1,Inf)))
freqf <- slam::row_sums(tdm)
wordsf <- names(freqf)
wordcloud(wordsf, freqf, min.freq=5, max.words = 150, colors = brewer.pal(8, "Dark2"))
set.seed(1111)
ksdfs <- ksdf %>% filter(state == "successful" & main_category == "Dance")
corpus <- Corpus(VectorSource(ksdfs$name))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, function(x) removeWords(x, stopwords()))
tdm <-TermDocumentMatrix(corpus, control=list(wordLengths=c(1,Inf)))
freqs <- slam::row_sums(tdm)
wordss <- names(freqs)
wordcloud(wordss, freqs, min.freq=5, max.words = 150, colors = brewer.pal(8, "Dark2"))
set.seed(1111)
ksdff <- ksdf %>% filter(state == "failed" & main_category == "Dance")
corpus <- Corpus(VectorSource(ksdff$name))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, function(x) removeWords(x, stopwords()))
tdm <-TermDocumentMatrix(corpus, control=list(wordLengths=c(1,Inf)))
freqf <- slam::row_sums(tdm)
wordsf <- names(freqf)
wordcloud(wordsf, freqf, min.freq=5, max.words = 150, colors = brewer.pal(8, "Dark2"))
set.seed(1111)
ksdfs <- ksdf %>% filter(state == "successful" & main_category == "Comics")
corpus <- Corpus(VectorSource(ksdfs$name))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, function(x) removeWords(x, stopwords()))
tdm <-TermDocumentMatrix(corpus, control=list(wordLengths=c(1,Inf)))
freqs <- slam::row_sums(tdm)
wordss <- names(freqs)
wordcloud(wordss, freqs, min.freq=5, max.words = 150, colors = brewer.pal(8, "Dark2"))
set.seed(1111)
ksdff <- ksdf %>% filter(state == "failed" & main_category == "Comics")
corpus <- Corpus(VectorSource(ksdff$name))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, function(x) removeWords(x, stopwords()))
tdm <-TermDocumentMatrix(corpus, control=list(wordLengths=c(1,Inf)))
freqf <- slam::row_sums(tdm)
wordsf <- names(freqf)
wordcloud(wordsf, freqf, min.freq=5, max.words = 150, colors = brewer.pal(8, "Dark2"))
set.seed(1111)
ksdfs <- ksdf %>% filter(state == "successful" & main_category == "Music")
corpus <- Corpus(VectorSource(ksdfs$name))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, function(x) removeWords(x, stopwords()))
tdm <-TermDocumentMatrix(corpus, control=list(wordLengths=c(1,Inf)))
freqs <- slam::row_sums(tdm)
wordss <- names(freqs)
wordcloud(wordss, freqs, min.freq=5, max.words = 150, colors = brewer.pal(8, "Dark2"))
set.seed(1111)
ksdff <- ksdf %>% filter(state == "failed" & main_category == "Music")
corpus <- Corpus(VectorSource(ksdff$name))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, function(x) removeWords(x, stopwords()))
tdm <-TermDocumentMatrix(corpus, control=list(wordLengths=c(1,Inf)))
freqf <- slam::row_sums(tdm)
wordsf <- names(freqf)
wordcloud(wordsf, freqf, min.freq=5, max.words = 150, colors = brewer.pal(8, "Dark2"))
Duration of projects seems to be inversely corelated with project success.
ggplot(ksdf, aes(x = state, y = duration, fill = ksdf$state)) +
geom_boxplot() +
theme(legend.position = "right") +
ylab("Duration") + xlab("") +
ggtitle("Duration of projects")
## Warning: Use of `ksdf$state` is discouraged. Use `state` instead.
Goal of the projects also seems to be inversely corelated with project success.
ggplot(ksdf, aes(x = state, y = log(usd_goal_real), fill = ksdf$state)) +
geom_boxplot() +
theme(legend.position = "right") +
ylab("Goal (Log trans)") + xlab("") +
ggtitle("Goal for projects (log transformed)")
## Warning: Use of `ksdf$state` is discouraged. Use `state` instead.
ggplot(ksdf, aes(x = state, y = log(usd_pledged_real), fill = ksdf$state)) +
geom_boxplot() +
theme(legend.position = "right") +
ylab("Pledged (Log trans)") + xlab("") +
ggtitle("Pledged for projects (log transformed)")
## Warning: Use of `ksdf$state` is discouraged. Use `state` instead.
## Warning: Removed 38634 rows containing non-finite values (stat_boxplot).
inc4 <- ksdf %>%
group_by(main_category) %>%
summarize(Total_Pledged = round(sum(usd_pledged_real),0)) %>% top_n(5)
## Selecting by Total_Pledged
inc4 <- arrange(inc4, desc(main_category))
inc4
ksdff <- ksdf %>% filter(state == "failed")
ksdfs <- ksdf %>% filter(state == "successful")
inc1 <- ksdf %>%
group_by(main_category) %>%
summarize(meanGoal = round(mean(usd_goal_real),0))
inc1 <- arrange(inc1, desc(main_category))
inc2 <- ksdfs %>%
group_by(main_category) %>%
summarize(meanGoal = round(mean(usd_goal_real),0))
inc2 <- arrange(inc2, desc(main_category))
inc3 <- ksdff %>%
group_by(main_category) %>%
summarize(meanGoal = round(mean(usd_goal_real),0))
inc3 <- arrange(inc3, desc(main_category))
inc1$Mean_Goal_Sucessful <- inc2$meanGoal
inc1$Mean_Goal_Failed <- inc3$meanGoal
names(inc1)[1] <- "Category"
names(inc1)[2] <- "Mean_Goal_All"
inc1
#Overall mean goal
mean(ksdf$usd_goal_real)
## [1] 41523.2
#Successful mean goal
mean(ksdfs$usd_goal_real)
## [1] 9535.702
#Failed mean goal
mean(ksdff$usd_goal_real)
## [1] 63189.47
#Overall median goal
median(ksdf$usd_goal_real)
## [1] 5000
#Failed median goal
median(ksdff$usd_goal_real)
## [1] 7500
#Successful median goal
median(ksdfs$usd_goal_real)
## [1] 3840
I want to use only certain columns for my modeling, so I will subset my data.
ksdf.sub <- dplyr::select(ksdf, -1, -2, -3, -5, -6, -7, -8, -9, -13)
I will split the data into Test and Train set using 70 - 30 split
set.seed(1111133)
spec = c(train = .70, test = .30)
x = sample(cut(seq(nrow(ksdf.sub)), nrow(ksdf.sub)*cumsum(c(0,spec)), labels = names(spec)))
res = split(ksdf.sub, x)
train.data <- res$train
test.data <- res$test
ml <- glm(state~., data = train.data, family = "binomial")
ml1 <- step(ml, direction = 'backward')
## Start: AIC=1962.01
## state ~ main_category + backers + country + usd_pledged_real +
## usd_goal_real + duration
##
## Df Deviance AIC
## - country 4 1924 1962
## <none> 1916 1962
## - duration 1 1920 1964
## - main_category 14 2000 2018
## - backers 1 2012 2056
## - usd_pledged_real 1 144213 144257
## - usd_goal_real 1 202558 202602
##
## Step: AIC=1961.71
## state ~ main_category + backers + usd_pledged_real + usd_goal_real +
## duration
##
## Df Deviance AIC
## <none> 1924 1962
## - duration 1 1929 1965
## - main_category 14 2009 2019
## - backers 1 2024 2060
## - usd_pledged_real 1 144270 144306
## - usd_goal_real 1 202736 202772
summary(ml1)
##
## Call:
## glm(formula = state ~ main_category + backers + usd_pledged_real +
## usd_goal_real + duration, family = "binomial", data = train.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.49 0.00 0.00 0.00 8.49
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.390797 0.163991 8.481 < 2e-16 ***
## main_categoryComics 0.177726 0.411628 0.432 0.665912
## main_categoryCrafts -0.098074 0.266621 -0.368 0.712993
## main_categoryDance 1.660660 0.722880 2.297 0.021603 *
## main_categoryDesign 0.141364 0.301235 0.469 0.638868
## main_categoryFashion 0.198409 0.267749 0.741 0.458678
## main_categoryFilm & Video 1.037797 0.198843 5.219 1.80e-07 ***
## main_categoryFood -1.166486 0.162034 -7.199 6.07e-13 ***
## main_categoryGames -0.152878 0.242308 -0.631 0.528090
## main_categoryJournalism 0.646121 0.463137 1.395 0.162987
## main_categoryMusic 0.797271 0.197728 4.032 5.53e-05 ***
## main_categoryPhotography 0.139881 0.272423 0.513 0.607623
## main_categoryPublishing 0.355228 0.206802 1.718 0.085847 .
## main_categoryTechnology 0.183854 0.319180 0.576 0.564602
## main_categoryTheater 1.512006 0.422279 3.581 0.000343 ***
## backers 0.088774 0.007349 12.080 < 2e-16 ***
## usd_pledged_real 0.089864 0.002506 35.865 < 2e-16 ***
## usd_goal_real -0.089707 0.002495 -35.954 < 2e-16 ***
## duration -0.006812 0.003789 -1.798 0.072219 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 313149.7 on 232024 degrees of freedom
## Residual deviance: 1923.7 on 232006 degrees of freedom
## AIC: 1961.7
##
## Number of Fisher Scoring iterations: 25
According to this GLM model - the variable significantly correlated with predictin project performance are Catogory (Dance, Film & Video, Food, Music, Theater), backers, amount pledged, and goal. Positive correcalation is observed with Dance, Film & Video and Theater Category, number of backers, and amount pledged. Negative correlation is observed with Food category, goal, and duration.
pred <- predict(ml1, newdata = test.data, type = "response")
predictionval <- ifelse(pred < 0.5,0,1)
table(as.factor(test.data$state), predictionval)
## predictionval
## 0 1
## failed 59379 79
## successful 2 39980
#Reference: https://datascienceplus.com/how-to-perform-logistic-regression-lda-qda-in-r/
The results are exceptionally good.
I will use a tree mode to determine the predictors with the highest significance for funding success and failure.
mltree <- rpart(state ~ ., data = train.data, method = "class")
rpart.plot(mltree)
Building a classification tree on the variable ‘State’ resulted in the above classification tree. According to this figure backers, goal and usd_pledged_real are the most significant predictors for the classification of response variable.
pred1 <- predict(mltree, test.data, type = "class")
confusionMatrix(test.data$state, pred1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction failed successful
## failed 56949 2509
## successful 956 39026
##
## Accuracy : 0.9652
## 95% CI : (0.964, 0.9663)
## No Information Rate : 0.5823
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.928
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9835
## Specificity : 0.9396
## Pos Pred Value : 0.9578
## Neg Pred Value : 0.9761
## Prevalence : 0.5823
## Detection Rate : 0.5727
## Detection Prevalence : 0.5979
## Balanced Accuracy : 0.9615
##
## 'Positive' Class : failed
##
set.seed(1111333)
train.data$main_category <- as.factor(train.data$main_category)
test.data$main_category <- as.factor(test.data$main_category)
mlrf <- randomForest(formula = state ~ ., data = train.data, importance = TRUE)
print(mlrf)
##
## Call:
## randomForest(formula = state ~ ., data = train.data, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0.19%
## Confusion matrix:
## failed successful class.error
## failed 137725 431 0.0031196618
## successful 9 93860 0.0000958783
pred2 <- predict(mlrf, test.data, type = "class")
confusionMatrix(test.data$state, pred2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction failed successful
## failed 59282 176
## successful 1 39981
##
## Accuracy : 0.9982
## 95% CI : (0.9979, 0.9985)
## No Information Rate : 0.5962
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9963
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.9956
## Pos Pred Value : 0.9970
## Neg Pred Value : 1.0000
## Prevalence : 0.5962
## Detection Rate : 0.5962
## Detection Prevalence : 0.5979
## Balanced Accuracy : 0.9978
##
## 'Positive' Class : failed
##
Random Forest model on the variable ‘State’ is performing extremely well as well.
In conclusion, both Random Forest and GLM are demonstration exceptional results and either one could be used. I would select GLM since it is much less computationally intensive than Random Forest.