1. Identifying Successful Projects

a) Success by category

There are several ways to identify success of a project:

- State (state): Whether a campaign was successful or not - Pledged Amount (pledged) - Achievement Ratio: Create a variable achievement_ratio by calculating the percentage of the original monetary goal reached by the actual amount pledged (that is pledged/goal x 100) - Number of backers (backers_count) - How quickly the goal was reached (difference between launched_at and state_changed_at) for those campaigns that were successful

Use one or more of these measures to visually summarize which categories were most successful in attracting funding on kickstarter. Briefly summarize your findings.

ks <- read_csv("kickstarter_projects_2020_02.csv")
ks %>%
  filter(state != "live") %>%
  group_by(top_category) %>%
  tally(sort = TRUE) %>%
  head()
ks_2 <- ks %>%
  filter(state != "live") %>%
  mutate(state = factor(state, levels = c("successful", "suspended", "canceled", "failed"))) %>%
  mutate(state_cat = case_when(
    state == "successful" ~ "successful",
    TRUE                  ~ "not successful")) %>%
  mutate(state_cat = factor(state_cat, levels = c("successful", "not successful"))) %>%
  mutate(achievement_ratio = pledged/goal*100) %>%
  mutate(deadline_2 = ymd(deadline)) %>%
  mutate(deadline_3 = as.POSIXct(deadline_2, tz = Sys.timezone())) %>%
  mutate(launched_at_2 = ymd(launched_at)) %>%
  mutate(launched_at_3 = as.POSIXct(launched_at_2, tz = Sys.timezone())) %>%
  mutate(state_changed_at_2 = ymd(state_changed_at)) %>%
  mutate(state_changed_at_3 = as.POSIXct(state_changed_at_2, tz = Sys.timezone())) %>%
  mutate(days_to_deadline = as.numeric(difftime(deadline_3, launched_at_3, units = "days")))%>%
  mutate(days_to_change_state = as.numeric(difftime(state_changed_at_3, launched_at_3, units = "days")))%>%
  mutate(funding_days_pct = as.numeric(days_to_change_state)/as.numeric(days_to_deadline)) %>%
  mutate(top_category_2 = case_when(
     top_category == "music"          ~ "music",
     top_category == "film & video"   ~ "film & video",
     top_category == "art"            ~ "art",
     top_category == "publishing"     ~ "publishing",
     top_category == "technology"     ~ "technology",
     top_category == "food"           ~ "food",
     TRUE                             ~ "other")) %>%
  mutate(top_category_2 = factor(top_category_2, levels = c("music", "film & video", "art", "publishing", "technology", "food", "other")))
theme_solar_pretty <- function()
  {
    theme_solarized(base_size = 12, base_family = "Trebuchet MS") %+replace% 
    theme(text = element_text(color = "#586e75"), 
          plot.title = element_text(size = 13, face = "bold", hjust = 0),
          plot.subtitle = element_text(hjust = 0),
          axis.title = element_text(face = "bold"), 
          legend.title = element_text(face = "bold"),
          element_line(size = 0.5, color = "#586e75"), 
          panel.background = element_blank(),
          panel.grid.major.x = element_line(),
          panel.grid.minor.x = element_blank(), 
          panel.grid.major.y = element_line(),
          panel.grid.minor.y = element_blank())
  }
ggplot() + 
  geom_jitter(data = ks_2, aes(x = goal/1000, y = backers_count, color = factor(state)), size = 1) +
  geom_smooth(data = subset(ks_2, state == "successful"), aes(x = goal/1000, y = backers_count), 
              lwd = 1.5, se = FALSE, color = "#586e75") +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  scale_x_continuous(labels = function(x) paste0("$", x, "K")) +
  scale_y_continuous(labels = comma) +
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 50000)) +
  xlab(NULL) +
  ylab("Number of backers\n") +
  labs(title = "Campaign popularity by funding goal", 
       subtitle = "\nA goal less than $2,000 is the most likely to succeed\n", 
       caption = "\nSource: webrobots.io") +
  ggplot2::annotate(geom = "text", x = 1025, y = 40000, label = "Only successful campaigns \nhave a positive corelation ", 
                    hjust = "right", fontface = "bold", size = 4, color = "#586e75")
ks_3 <- ks_2 %>%
  filter(goal <= 10000) %>%
  mutate(goal_cat = cut(goal, breaks = 5, labels = c("20%", "40%", "60%", "80%", "100%"))) %>%
  mutate(state_2 = state) %>%
  mutate(state_2 = factor(state_2, levels = c("suspended", "canceled", "failed", "successful")))
state_levels <- ks_3 %>%
  group_by(state) %>%
  summarise_at(vars(backers_count), sum) %>%
  arrange(desc(backers_count)) %>%
  select(state) %>%
  mutate(state = as.character(state)) %>%
  flatten() %>%
  unlist()
ggplot(ks_3, aes(x = goal_cat, fill = state)) + 
  geom_bar(position = "stack", stat = "count") +
  theme_solar_pretty() +
  scale_fill_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top", panel.grid.major.x = element_blank()) +
  scale_x_discrete(labels = c("$10 - $2,000", "$2K - $4K", "$4K - $6K", 
                              "$6K - $8K", "$8K - $10K")) +
  scale_y_continuous(labels = comma) +
  ylab(NULL) +
  xlab(NULL) +
  labs(title = "Campaign success by fundraising goal", 
       subtitle = "\nA goal less than $2,000 is the most likely to succeed\n", 
       caption = "\nSource: webrobots.io")

I have limited the data to campaigns with goals under $10,000. The smaller gthe goal,the more likely a campaign is to succeed. Campaigns with goals under $6,000 are more likely than not to succeed, whereas campaigns with goals over $6,000 are at least 50% likely to fail.

ggplot(data = subset(ks_2, top_category_2 != "other"), aes(fill = state_cat, x = top_category_2)) + 
  geom_bar(position = "dodge", stat = "count") +
  theme_solar_pretty() +
  scale_fill_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top", panel.grid.major.x = element_blank()) +
  scale_x_discrete(labels = c("Music", "Film & Video", "Art", 
                              "Publishing", "Technology", "Food", "Other")) +
  scale_y_continuous(labels = comma) +
  ylab(NULL) +
  xlab(NULL) +
  labs(title = "Campaign success vs. failure by topic", 
       subtitle = "\nMusic is the most successful category, food is the least successful\n", 
       caption = "\nSource: webrobots.io")

I have limited the data to campaigns in the 6 most popular categories. Music and publishing campaigns are two times as likely to succeed instead of fail, film + video and art campaigns are slightly more than 50% likely to succeed, and technology and food campaigns are more likely than not to fail.

BONUS ONLY: b) Success by location

Now, use the location information to calculate the total number of successful projects by state (if you are ambitious, normalize by population). Also, identify the Top 50 “innovative” cities in the U.S. (by whatever measure you find plausible). Provide a leaflet map showing the most innovative states and cities in the U.S. on a single map based on these information.

2. Writing your success story

Each project contains a blurb – a short description of the project. While not the full description of the project, the short headline is arguably important for inducing interest in the project (and ultimately popularity and success). Let’s analyze the text.

a) Cleaning the text and word cloud

To reduce the time for analysis, select the 1000 most successful projects and a sample of 1000 unsuccessful projects. Use the cleaning functions introduced in lecture (or write your own in addition) to remove unnecessary words (stop words), syntax, punctuation, numbers, white space etc. Note, that many projects use their own unique brand names in upper cases, so try to remove these fully capitalized words as well (since we are aiming to identify common words across descriptions). Stem the words left over and complete the stems. Create a document-term-matrix.

Provide a word cloud of the most frequent or important words (your choice which frequency measure you choose) among the most successful projects.

In order to create the corpus without splitting and rejoining the original data, I took both the top and bottom 1000 rows based on achievement ratio.

ks_3 <- ks_2 %>%
  mutate(doc_id = id) %>%
  mutate(text = blurb)
ks_4 <- ks_3 %>%
  select(doc_id, text, backers_count, goal, pledged, state, top_category, top_category_2, achievement_ratio, days_to_change_state, funding_days_pct)
ks_top_bottom <- ks_4 %>%
  filter(state != ("canceled")) %>%
  filter(state != ("suspended")) %>%
  arrange(desc(achievement_ratio)) %>%
  filter(!row_number() %in% 001001:116851)
ks_meta <- ks_top_bottom %>%
  select(!text) 
ks_ds <- DataframeSource(ks_top_bottom)
ks_corpus <- VCorpus(ks_ds)

I was unable to reattach the metadata after completing the stems for the corpus text, so I will also created two separate corpuses of the top successful campaigns and a random sample of failed campaigns:

top_s <- ks_4 %>%
  filter(state == "successful") %>%
  top_n(1000, achievement_ratio)
sample_f <- ks_4 %>%
  filter(state == "failed") %>%
  sample_n(1000)
ks_ds_s <- DataframeSource(top_s)
ks_corpus_s <- VCorpus(ks_ds_s)
ks_ds_f <- DataframeSource(sample_f)
ks_corpus_f <- VCorpus(ks_ds_f)
clean_corpus <- function(corpus){
  require(tm)
  require(qdap)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, content_transformer(replace_abbreviation))
  corpus <- tm_map(corpus, content_transformer(replace_contraction))
  corpus <- tm_map(corpus, content_transformer(replace_symbol))
  corpus <- tm_map(corpus, content_transformer(bracketX))
  corpus <- tm_map(corpus, removeWords, c(stopwords("en")))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
  }
ks_corpus_s_clean <- clean_corpus(ks_corpus_s)
ks_corpus_f_clean <- clean_corpus(ks_corpus_f)
ks_corpus_s_clean_stemmed <- tm_map(ks_corpus_s_clean, stemDocument)
ks_corpus_f_clean_stemmed <- tm_map(ks_corpus_f_clean, stemDocument)
stemCompletionBrambor <- function(x, dictionary) {
   x <- unlist(strsplit(as.character(x), " "))
   x <- x[x != ""]
   x <- stemCompletion(x, dictionary = dictionary)
   x <- paste(x, sep = "", collapse = " ")
   PlainTextDocument(stripWhitespace(x))
   }
no_cores <- detectCores() - 1
ks_corpus_s_final <- mclapply(ks_corpus_s_clean_stemmed, stemCompletionBrambor, dictionary = ks_corpus_s_clean, mc.cores = no_cores)
ks_corpus_f_final <- mclapply(ks_corpus_f_clean_stemmed, stemCompletionBrambor, dictionary = ks_corpus_f_clean, mc.cores = no_cores)
ks_corpus_s_final <- as.VCorpus(ks_corpus_s_final)
ks_corpus_f_final <- as.VCorpus(ks_corpus_f_final)
ks_tdm_s <- TermDocumentMatrix(ks_corpus_s_final)
ks_tdm_s_tidy <- tidytext::tidy(ks_tdm_s)
ks_tdm_f <- TermDocumentMatrix(ks_corpus_f_final)
ks_tdm_f_tidy <- tidytext::tidy(ks_tdm_f)
ks_tf_idf_s <- ks_tdm_s_tidy %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf)) 
wordcloud(ks_tf_idf_s$term, ks_tf_idf_s$tf, max.words = 100, title.bg.colors = "#fdf6e3", colors = "#859900")

b) Success in words

Provide a pyramid plot to show how the words between successful and unsuccessful projects differ in frequency. A selection of 10-20 top words is sufficient here.

Since I was unable to reattach the metadata in Question 1a, I could not create plots for common or different words.

ks_tf_idf_f <- ks_tdm_f_tidy %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf)) 
wordcloud(ks_tf_idf_f$term, ks_tf_idf_f$tf, max.words = 100, colors = "#cb4b16")

c) Simplicity as a virtue

These blurbs are short in length (max. 150 characters) but let’s see whether brevity and simplicity still matters. Calculate a readability measure (Flesh Reading Ease, Flesh Kincaid or any other comparable measure) for the texts. Visualize the relationship between the readability measure and one of the measures of success. Briefly comment on your finding.

ks_text <- ks_top_bottom %>%
  select(text) %>%
  deframe()
ks_text_vec <- VectorSource(ks_text)
ks_text_corpus <- VCorpus(ks_text_vec)
ks_corpus_q <- corpus(ks_corpus)
ks_corpus_q$backers_count <- ks_meta$backers_count
ks_corpus_q$goal <- ks_meta$goal
ks_corpus_q$pledged <- ks_meta$pledged
ks_corpus_q$state <- ks_meta$state
ks_corpus_q$top_category <- ks_meta$top_category
ks_corpus_q$top_category_2 <- ks_meta$top_category_2
ks_corpus_q$achievement_ratio <- ks_meta$achievement_ratio
ks_corpus_q$days_to_change_state <- ks_meta$days_to_change_state
ks_corpus_q$funding_days_pct <- ks_meta$funding_days_pct
ks_text_name <- ks_top_bottom %>% 
  select(doc_id) %>%
  mutate(doc_id = as.character(doc_id)) %>%
  deframe()
docnames(ks_corpus_q) <- ks_text_name
ks_corpus_q_read <- textstat_readability(ks_corpus_q, measure = 'Flesch.Kincaid')
ks_corpus_q_read_clean <- ks_corpus_q_read %>%
  mutate(fkgl = Flesch.Kincaid) %>%
  select(!Flesch.Kincaid) %>%
  mutate(fkgl_int = as.integer(fkgl + 0.5)) %>%
  mutate(fkgl_cat = cut(fkgl_int, breaks = c(-3, 1, 5, 9, 13, 17, Inf), ordered_result = TRUE, labels = paste("Level", 1:6, sep = " "))) %>%
  mutate(fkgl_cat = case_when(
    fkgl_cat == "Level 1" ~ "No School",
    fkgl_cat == "Level 2" ~ "Elementary School",
    fkgl_cat == "Level 3" ~ "Middle School",
    fkgl_cat == "Level 4" ~ "High School",
    fkgl_cat == "Level 5" ~ "College",
    fkgl_cat == "Level 6" ~ "Graduate School")) %>% 
  mutate(fkgl_cat = ordered(fkgl_cat, levels = c("No School", "Elementary School", "Middle School", "High School", "College", 
                                                "Graduate School"))) %>%
  mutate(fkgl_cat_2 = cut(fkgl_int, breaks = c(-3, 1, 7, 13, 19, Inf), ordered_result = TRUE, labels = paste("Level", 1:5, sep = " "))) %>%
  mutate(fkgl_cat_2 = case_when(
    fkgl_cat_2 == "Level 1" ~ "No School",
    fkgl_cat_2 == "Level 2" ~ "1-6 Years",
    fkgl_cat_2 == "Level 3" ~ "7-12 Years",
    fkgl_cat_2 == "Level 4" ~ "13-18 Years",
    fkgl_cat_2 == "Level 5" ~ "19+ Years")) %>% 
  mutate(fkgl_cat_2 = ordered(fkgl_cat_2, levels = c("No School", "1-6 Years", "1-8 Years", "7-12 Years", "13-18 Years", "19+ Years")))
ks_corpus_q_df <- data_frame(
  document = ks_meta$doc_id,
  backers_count = ks_meta$backers_count,
  goal = ks_meta$goal,
  pledged = ks_meta$pledged,
  state = ks_meta$state,
  top_category = ks_meta$top_category,
  top_category_2 = ks_meta$top_category_2,
  achievement_ratio = ks_meta$achievement_ratio,
  days_to_change_state = ks_meta$days_to_change_state,
  funding_days_pct = ks_meta$funding_days_pct,
  words = ntoken(ks_corpus_q))
ks_corpus_q_df_clean <- ks_corpus_q_df %>%
  mutate(document = as.character(document), 
         backers_count = as.numeric(backers_count),
         goal = as.numeric(goal),
         pledged = as.numeric(pledged),
         achievement_ratio = as.numeric(achievement_ratio),
         days_to_change_state = as.numeric(days_to_change_state),
         funding_days_pct = as.numeric(funding_days_pct),
         words = as.numeric(words)) %>%
  mutate(top_category_2 = factor(top_category_2, levels = c("music", "film & video", "art", "publishing", "technology", "food", "other")))
ks_corpus_fkgl <- inner_join(ks_corpus_q_df_clean, ks_corpus_q_read_clean, by = "document")
ggplot(data = subset(ks_corpus_fkgl, state == "successful"), 
       aes(x = fkgl_cat, y = backers_count, fill = fkgl_cat)) +
  geom_violin(color = "#073642", size = 0.3) +
  theme_solar_pretty() +
  scale_fill_solarized(accent = 'yellow') +
  theme(legend.position = "none", panel.grid.major.x = element_blank()) +
  scale_x_discrete(labels = c("No School", "Elementary\nSchool", "Middle\nSchool", 
                              "High School", "College", "Graduate\nSchool")) +
  scale_y_continuous(labels = comma, limits = c(0,10000)) +
  xlab("\nFlesch-Kinkaid readability score") +
  ylab(NULL) +
  labs(title = "Successful campaign popularity by reading level", 
       subtitle = "\nReading levels with the most backers also have many campaigns with only a few backers\n", 
       caption = "Source: webrobots.io")

I have limited the data to campaigns with fewer than 10,000 backers. Campaigns with reading levels between elementary school and college are more likely to get above 5,000 backers, but a large proportion of these campaigns garner less than 1,000 backers. This pattenr is likely due to the small smaple size of campaigns with very low and very high scores.

state_med_fkgl <- ddply(ks_corpus_fkgl, "state", summarise, med = median(fkgl))
ggplot(ks_corpus_fkgl, aes(x = fkgl, color = state)) + 
  geom_density(size = .75) +
  geom_vline(data = state_med_fkgl, aes(xintercept = med, color = state), linetype = "dashed", size = .75) +
  theme_solar_pretty() +
  scale_colour_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  scale_x_continuous(breaks = c(-3, 0, 3, 6, 9, 12, 15, 18, 21), limits = c(-3, 23)) +
  scale_y_continuous(breaks = c(0, .03, .06, .09)) +
  xlab("\nFlesch-Kinkaid readability score") +
  ylab("Density\n") +
  labs(title = "Successful vs. failed campaigns by reading level",
       subtitle = "\nCampaigns succeed most often with a score between 6 and 10\n",
       caption = "Source: webrobots.io")

The sweet spot for success appears to be a reading level between 6th and 10th grade. Within this boundary, successful campaigns have the greatest density. Slightly outside this boundary on both sides, failed campaigns have the greatest density. Also, the median readability score for successful campaigns is only slightly lower than the median score for failed campaigns.

3. Sentiment

Now, let’s check whether the use of positive/negative words or specific emotions helps a project to be successful.

a) Stay positive

Calculate the tone of each text based on the positive and negative words that are being used. You can rely on the Hu & Liu dictionary provided in lecture or use the Bing dictionary contained in the tidytext package (tidytext::sentiments). Visualize the relationship between tone of the document and success. Briefly comment.

bing_dict <- as.dictionary(get_sentiments("bing"))
bing_dict_ks_dtm <- dfm(ks_corpus_q, dictionary = bing_dict)
bing_dict_ks_df <- melt(as.matrix(bing_dict_ks_dtm)) %>%
  spread(key = features, value = value) %>%
  mutate(document = as.numeric(docs)) %>%
  select(!docs) %>%
  mutate(pos_score = as.integer(positive-negative)) %>%
  mutate(pos_score_cat = as.factor(pos_score)) %>%
  mutate(pos_ratio = (positive+1)/(negative+1)) 
  
ks_corpus_bing <- inner_join(ks_corpus_q_df, bing_dict_ks_df, by = "document")
ggplot(data = ks_corpus_bing, aes(x = pos_score, y = backers_count, color = state)) +
  geom_jitter() +
  xlim(-3.5, 4.5) +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.position = "none", panel.grid.major.x = element_blank()) +
  scale_y_continuous(labels = comma, limits = c(0, 10000)) +
  xlab("\nBing positivity score") +
  ylab("Number of backers\n") +
  labs(title = "Campaign popularity by positivity", 
       subtitle = "\nNeutral and slightly positive cmapaigns are the most successful\n", 
       caption = "Source: webrobots.io")

I have limited the data to campaigns with fewer than 10,000 backers. Campaigns with positivity scores between 0 and 2 are the most likely to have more than 5,000 backers.

ggplot(data = ks_corpus_bing, aes(x = pos_score, color = state)) +
  geom_density(size = .75) +
  xlim(-4, 6) +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  scale_y_continuous(breaks = c(0, .2, .4, .6, .8)) +
  xlab("\nBing positivity score") +
  ylab("Density\n") +
  labs(title = "Successful vs. failed campaigns by positivity",
       subtitle = "\n...\n",
       caption = "Source: webrobots.io")

Campaigns on the extremes of the positivity scale (very positive or negative) have a higher success rate than campaigns with neutral or slightly positive scores. This suggests that campaign blurbs with emotive language do better than those with a more flat affect.

b) Positive vs. negative

Segregate all 2,000 blurbs into positive and negative texts based on their polarity score calculated in step (a). Now, collapse the positive and negative texts into two larger documents. Create a document-term-matrix based on this collapsed set of two documents. Generate a comparison cloud showing the most-frequent positive and negative words.

ks_corpus_bing <- ks_corpus_bing %>%
  mutate(sent_cat = case_when(
    pos_ratio >= 1 ~ "positive",
    TRUE           ~ "negative"))
ks_docs_pos <- ks_corpus_bing %>%
  filter(sent_cat == "positive") %>%
  mutate(doc_id = document) %>%
  select(doc_id)
ks_docs_neg <- ks_corpus_bing %>%
  filter(sent_cat == "negative") %>%
  mutate(doc_id = document) %>%
  select(doc_id)
ks_pos <- inner_join(ks_docs_pos, ks_top_bottom, key = doc_id)
ks_neg <- inner_join(ks_docs_neg, ks_top_bottom, key = doc_id)
ks_ds_pos <- DataframeSource(ks_pos)
ks_corpus_pos <- VCorpus(ks_ds_pos)
ks_ds_neg <- DataframeSource(ks_neg)
ks_corpus_neg <- VCorpus(ks_ds_neg)
ks_corpus_pos_clean <- clean_corpus(ks_corpus_pos)
no_cores <- detectCores() - 1
ks_corpus_pos_final <- mclapply(ks_corpus_pos_clean_stemmed, stemCompletionBrambor, dictionary = ks_corpus_pos_clean, mc.cores = no_cores)
ks_corpus_pos_final <- as.VCorpus(ks_corpus_pos_final)
ks_corpus_neg_final <- as.VCorpus(ks_corpus_neg_final)
ks_tdm_pos <- TermDocumentMatrix(ks_corpus_pos_final)
ks_tdm_pos_tidy <- tidytext::tidy(ks_tdm_pos)
ks_tdm_neg <- TermDocumentMatrix(ks_corpus_neg_final)
ks_tdm_neg_tidy <- tidytext::tidy(ks_tdm_neg)
ks_tf_idf_pos <- ks_tdm_pos_tidy %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf)) 
wordcloud(ks_tf_idf_pos$term, ks_tf_idf_pos$tf, max.words = 100, colors = "#b58900")

ks_tf_idf_neg <- ks_tdm_neg_tidy %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf)) 
wordcloud(ks_tf_idf_neg$term, ks_tf_idf_neg$tf, max.words = 100, colors = "#268bd2")

c) Get in their mind

Now, use the NRC Word-Emotion Association Lexicon in the tidytext package to identify a larger set of emotions (anger, anticipation, disgust, fear, joy, sadness, surprise, trust). Again, visualize the relationship between the use of words from these categories and success. What is your finding?

nrc_dict <- as.dictionary(get_sentiments("nrc"))
nrc_dict_ks_dtm <- dfm(ks_corpus_q, dictionary = nrc_dict)
nrc_dict_ks_df <- melt(as.matrix(nrc_dict_ks_dtm)) %>%
  spread(key = features, value = value) %>%
  mutate(document = as.numeric(docs)) %>%
  select(!docs) %>%
  mutate(sent_score = anger + anticipation + disgust + fear + joy + sadness + surprise + trust)
ks_corpus_nrc <- inner_join(ks_corpus_q_df, nrc_dict_ks_df, by = "document")
ggplot(data = ks_corpus_nrc, aes(x = anticipation, color = state)) +
  geom_density(size = .75) +
  xlim(0,4) +
  ylim(0,1.6) +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  xlab("\nNRC anticipation score") +
  ylab("Density\n") +
  labs(title = "Successful vs. failed campaigns by anticipation",
       subtitle = "\n...\n",
       caption = "Source: webrobots.io")

ggplot(data = ks_corpus_nrc, aes(x = joy, color = state)) +
  geom_density(size = .75) +
  xlim(0,4) +
  ylim(0,1.6) +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  xlab("\nNRC joy score") +
  ylab("Density\n") +
  labs(title = "Successful vs. failed campaigns by joy",
       subtitle = "\n...\n",
       caption = "Source: webrobots.io")

ggplot(data = ks_corpus_nrc, aes(x = trust, color = state)) +
  geom_density(size = .75) +
  xlim(0,4) +
  ylim(0,1.6) +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  xlab("\nNRC trust score") +
  ylab("Density\n") +
  labs(title = "Successful vs. failed campaigns by trust",
       subtitle = "\n...\n",
       caption = "Source: webrobots.io")

I limited the data to campaigns with emotion scores less than 4. I looked at anticipation, joy, and trust, which I consider the most likely to inspire support for Kickstarter campaigns. However, for each emotion, campaigns with low scores are more likely to succeed, whereas those with higher scores were more likely to fail.

ggplot(data = ks_corpus_nrc, aes(x = sent_score, color = state)) +
  geom_density(size = .75) +
  xlim(0,20) +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  xlab("\nNRC total sentiment score") +
  ylab("Density\n") +
  labs(title = "Successful vs. failed campaigns by all emotions",
       subtitle = "\n...\n",
       caption = "Source: webrobots.io")

I then combined the scores for all emotions into a single “emotive” score. Campaigns with lower scores for any emotion are the most likely to be successful. The sweet spot appears to be scores between 0 and 3.

---
title: "Homework 3: Kickstarter"
author: "Alison Ryland"
date: "April 20, 2020"
output: html_notebook
---

```{r Setup, include = FALSE, results = 'hide', warning = FALSE}
library(knitr)
library(pacman)

p_load(tidyverse, lubridate, ggplot2, ggthemes, tm, qdap, quanteda, SnowballC, parallel, NLP, tidytext, wordcloud, plotrix, RColorBrewer, plyr, forcats, extrafont, scales, reshape2, stringr, textdata)
```

## 1. Identifying Successful Projects

### a) Success by category

*There are several ways to identify success of a project:*

*- State (state): Whether a campaign was successful or not*
*- Pledged Amount (pledged)*
*- Achievement Ratio: Create a variable achievement_ratio by calculating the percentage of the original monetary goal reached by the actual amount pledged (that is pledged/goal x 100)*
*- Number of backers (backers_count)*
*- How quickly the goal was reached (difference between launched_at and state_changed_at) for those campaigns that were successful*

*Use one or more of these measures to visually summarize which categories were most successful in attracting funding on kickstarter. Briefly summarize your findings.*

```{r, message = FALSE, result = 'hide'}
ks <- read_csv("kickstarter_projects_2020_02.csv")
```

```{r, result = 'hide'}
ks %>%
  filter(state != "live") %>%
  group_by(top_category) %>%
  tally(sort = TRUE) %>%
  head()
```

```{r}
ks_2 <- ks %>%
  filter(state != "live") %>%
  mutate(state = factor(state, levels = c("successful", "suspended", "canceled", "failed"))) %>%
  mutate(state_cat = case_when(
    state == "successful" ~ "successful",
    TRUE                  ~ "not successful")) %>%
  mutate(state_cat = factor(state_cat, levels = c("successful", "not successful"))) %>%
  mutate(achievement_ratio = pledged/goal*100) %>%
  mutate(deadline_2 = ymd(deadline)) %>%
  mutate(deadline_3 = as.POSIXct(deadline_2, tz = Sys.timezone())) %>%
  mutate(launched_at_2 = ymd(launched_at)) %>%
  mutate(launched_at_3 = as.POSIXct(launched_at_2, tz = Sys.timezone())) %>%
  mutate(state_changed_at_2 = ymd(state_changed_at)) %>%
  mutate(state_changed_at_3 = as.POSIXct(state_changed_at_2, tz = Sys.timezone())) %>%
  mutate(days_to_deadline = as.numeric(difftime(deadline_3, launched_at_3, units = "days")))%>%
  mutate(days_to_change_state = as.numeric(difftime(state_changed_at_3, launched_at_3, units = "days")))%>%
  mutate(funding_days_pct = as.numeric(days_to_change_state)/as.numeric(days_to_deadline)) %>%
  mutate(top_category_2 = case_when(
     top_category == "music"          ~ "music",
     top_category == "film & video"   ~ "film & video",
     top_category == "art"            ~ "art",
     top_category == "publishing"     ~ "publishing",
     top_category == "technology"     ~ "technology",
     top_category == "food"           ~ "food",
     TRUE                             ~ "other")) %>%
  mutate(top_category_2 = factor(top_category_2, levels = c("music", "film & video", "art", "publishing", "technology", "food", "other")))
```

```{r}
theme_solar_pretty <- function()
  {
    theme_solarized(base_size = 12, base_family = "Trebuchet MS") %+replace% 
    theme(text = element_text(color = "#586e75"), 
          plot.title = element_text(size = 13, face = "bold", hjust = 0),
          plot.subtitle = element_text(hjust = 0),
          axis.title = element_text(face = "bold"), 
          legend.title = element_text(face = "bold"),
          element_line(size = 0.5, color = "#586e75"), 
          panel.background = element_blank(),
          panel.grid.major.x = element_line(),
          panel.grid.minor.x = element_blank(), 
          panel.grid.major.y = element_line(),
          panel.grid.minor.y = element_blank())
  }
```

```{r, message = FALSE, warning = FALSE}
ggplot() + 
  geom_jitter(data = ks_2, aes(x = goal/1000, y = backers_count, color = factor(state)), size = 1) +
  geom_smooth(data = subset(ks_2, state == "successful"), aes(x = goal/1000, y = backers_count), 
              lwd = 1.5, se = FALSE, color = "#586e75") +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  scale_x_continuous(labels = function(x) paste0("$", x, "K")) +
  scale_y_continuous(labels = comma) +
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 50000)) +
  xlab(NULL) +
  ylab("Number of backers\n") +
  labs(title = "Campaign popularity by funding goal", 
       subtitle = "\nA goal less than $2,000 is the most likely to succeed\n", 
       caption = "\nSource: webrobots.io") +
  ggplot2::annotate(geom = "text", x = 1025, y = 40000, label = "Only successful campaigns \nhave a positive corelation ", 
                    hjust = "right", fontface = "bold", size = 4, color = "#586e75")
```

```{r}
ks_3 <- ks_2 %>%
  filter(goal <= 10000) %>%
  mutate(goal_cat = cut(goal, breaks = 5, labels = c("20%", "40%", "60%", "80%", "100%"))) %>%
  mutate(state_2 = state) %>%
  mutate(state_2 = factor(state_2, levels = c("suspended", "canceled", "failed", "successful")))

state_levels <- ks_3 %>%
  group_by(state) %>%
  summarise_at(vars(backers_count), sum) %>%
  arrange(desc(backers_count)) %>%
  select(state) %>%
  mutate(state = as.character(state)) %>%
  flatten() %>%
  unlist()
```

```{r}
ggplot(ks_3, aes(x = goal_cat, fill = state)) + 
  geom_bar(position = "stack", stat = "count") +
  theme_solar_pretty() +
  scale_fill_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top", panel.grid.major.x = element_blank()) +
  scale_x_discrete(labels = c("$10 - $2,000", "$2K - $4K", "$4K - $6K", 
                              "$6K - $8K", "$8K - $10K")) +
  scale_y_continuous(labels = comma) +
  ylab(NULL) +
  xlab(NULL) +
  labs(title = "Campaign success by fundraising goal", 
       subtitle = "\nSmaller goals have a greater chance of success\n", 
       caption = "\nSource: webrobots.io")
```

I have limited the data to campaigns with goals under `$`10,000. The smaller gthe goal,the more likely a campaign is to succeed. Campaigns with goals under `$`6,000 are more likely than not to succeed, whereas campaigns with goals over `$`6,000 are at least 50% likely to fail.

```{r}
ggplot(data = subset(ks_2, top_category_2 != "other"), aes(fill = state_cat, x = top_category_2)) + 
  geom_bar(position = "dodge", stat = "count") +
  theme_solar_pretty() +
  scale_fill_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top", panel.grid.major.x = element_blank()) +
  scale_x_discrete(labels = c("Music", "Film & Video", "Art", 
                              "Publishing", "Technology", "Food", "Other")) +
  scale_y_continuous(labels = comma) +
  ylab(NULL) +
  xlab(NULL) +
  labs(title = "Campaign success vs. failure by topic", 
       subtitle = "\nMusic is the most successful category, food is the least successful\n", 
       caption = "\nSource: webrobots.io")
```

I have limited the data to campaigns in the 6 most popular categories. Music and publishing campaigns are two times as likely to succeed instead of fail, film + video and art campaigns are slightly more than 50% likely to succeed, and technology and food campaigns are more likely than not to fail.

### BONUS ONLY: b) Success by location

*Now, use the location information to calculate the total number of successful projects by state (if you are ambitious, normalize by population). Also, identify the Top 50 “innovative” cities in the U.S. (by whatever measure you find plausible). Provide a leaflet map showing the most innovative states and cities in the U.S. on a single map based on these information.*


## 2. Writing your success story

*Each project contains a blurb – a short description of the project. While not the full description of the project, the short headline is arguably important for inducing interest in the project (and ultimately popularity and success). Let’s analyze the text.*

### a) Cleaning the text and word cloud

*To reduce the time for analysis, select the 1000 most successful projects and a sample of 1000 unsuccessful projects. Use the cleaning functions introduced in lecture (or write your own in addition) to remove unnecessary words (stop words), syntax, punctuation, numbers, white space etc. Note, that many projects use their own unique brand names in upper cases, so try to remove these fully capitalized words as well (since we are aiming to identify common words across descriptions). Stem the words left over and complete the stems. Create a document-term-matrix.*

*Provide a word cloud of the most frequent or important words (your choice which frequency measure you choose) among the most successful projects.*

In order to create the corpus without splitting and rejoining the original data, I took both the top and bottom 1000 rows based on achievement ratio.

```{r}
ks_3 <- ks_2 %>%
  mutate(doc_id = id) %>%
  mutate(text = blurb)

ks_4 <- ks_3 %>%
  select(doc_id, text, backers_count, goal, pledged, state, top_category, top_category_2, achievement_ratio, days_to_change_state, funding_days_pct)

ks_top_bottom <- ks_4 %>%
  filter(state != ("canceled")) %>%
  filter(state != ("suspended")) %>%
  arrange(desc(achievement_ratio)) %>%
  filter(!row_number() %in% 001001:116851)

ks_meta <- ks_top_bottom %>%
  select(!text) 

ks_ds <- DataframeSource(ks_top_bottom)
ks_corpus <- VCorpus(ks_ds)
```

I was unable to reattach the metadata after completing the stems for the corpus text, so I will also created two separate corpuses of the top successful campaigns and a random sample of failed campaigns:

```{r}
top_s <- ks_4 %>%
  filter(state == "successful") %>%
  top_n(1000, achievement_ratio)

sample_f <- ks_4 %>%
  filter(state == "failed") %>%
  sample_n(1000)

ks_ds_s <- DataframeSource(top_s)
ks_corpus_s <- VCorpus(ks_ds_s)

ks_ds_f <- DataframeSource(sample_f)
ks_corpus_f <- VCorpus(ks_ds_f)
```

```{r}
clean_corpus <- function(corpus){
  require(tm)
  require(qdap)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, content_transformer(replace_abbreviation))
  corpus <- tm_map(corpus, content_transformer(replace_contraction))
  corpus <- tm_map(corpus, content_transformer(replace_symbol))
  corpus <- tm_map(corpus, content_transformer(bracketX))
  corpus <- tm_map(corpus, removeWords, c(stopwords("en")))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
  }
```

```{r, message = FALSE}
ks_corpus_s_clean <- clean_corpus(ks_corpus_s)
ks_corpus_f_clean <- clean_corpus(ks_corpus_f)

ks_corpus_s_clean_stemmed <- tm_map(ks_corpus_s_clean, stemDocument)
ks_corpus_f_clean_stemmed <- tm_map(ks_corpus_f_clean, stemDocument)
```

```{r}
stemCompletionBrambor <- function(x, dictionary) {
   x <- unlist(strsplit(as.character(x), " "))
   x <- x[x != ""]
   x <- stemCompletion(x, dictionary = dictionary)
   x <- paste(x, sep = "", collapse = " ")
   PlainTextDocument(stripWhitespace(x))
   }
```

```{r}
no_cores <- detectCores() - 1

ks_corpus_s_final <- mclapply(ks_corpus_s_clean_stemmed, stemCompletionBrambor, dictionary = ks_corpus_s_clean, mc.cores = no_cores)
ks_corpus_f_final <- mclapply(ks_corpus_f_clean_stemmed, stemCompletionBrambor, dictionary = ks_corpus_f_clean, mc.cores = no_cores)
```

```{r}
ks_corpus_s_final <- as.VCorpus(ks_corpus_s_final)
ks_corpus_f_final <- as.VCorpus(ks_corpus_f_final)
```

```{r}
ks_tdm_s <- TermDocumentMatrix(ks_corpus_s_final)
ks_tdm_s_tidy <- tidytext::tidy(ks_tdm_s)

ks_tdm_f <- TermDocumentMatrix(ks_corpus_f_final)
ks_tdm_f_tidy <- tidytext::tidy(ks_tdm_f)
```

```{r, warning = FALSE}
ks_tf_idf_s <- ks_tdm_s_tidy %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf)) 

wordcloud(ks_tf_idf_s$term, ks_tf_idf_s$tf, max.words = 100, title.bg.colors = "#fdf6e3", colors = "#859900")
```

### b) Success in words

*Provide a pyramid plot to show how the words between successful and unsuccessful projects differ in frequency. A selection of 10-20 top words is sufficient here.*

Since I was unable to reattach the metadata in Question 1a, I could not create plots for common or different words. 

```{r, warning = FALSE}
ks_tf_idf_f <- ks_tdm_f_tidy %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf)) 

wordcloud(ks_tf_idf_f$term, ks_tf_idf_f$tf, max.words = 100, colors = "#cb4b16")
```

### c) Simplicity as a virtue

*These blurbs are short in length (max. 150 characters) but let’s see whether brevity and simplicity still matters. Calculate a readability measure (Flesh Reading Ease, Flesh Kincaid or any other comparable measure) for the texts. Visualize the relationship between the readability measure and one of the measures of success. Briefly comment on your finding.*

```{r}
ks_text <- ks_top_bottom %>%
  select(text) %>%
  deframe()

ks_text_vec <- VectorSource(ks_text)
ks_text_corpus <- VCorpus(ks_text_vec)

ks_corpus_q <- corpus(ks_corpus)

ks_corpus_q$backers_count <- ks_meta$backers_count
ks_corpus_q$goal <- ks_meta$goal
ks_corpus_q$pledged <- ks_meta$pledged
ks_corpus_q$state <- ks_meta$state
ks_corpus_q$top_category <- ks_meta$top_category
ks_corpus_q$top_category_2 <- ks_meta$top_category_2
ks_corpus_q$achievement_ratio <- ks_meta$achievement_ratio
ks_corpus_q$days_to_change_state <- ks_meta$days_to_change_state
ks_corpus_q$funding_days_pct <- ks_meta$funding_days_pct
```

```{r}
ks_text_name <- ks_top_bottom %>% 
  select(doc_id) %>%
  mutate(doc_id = as.character(doc_id)) %>%
  deframe()

docnames(ks_corpus_q) <- ks_text_name

ks_corpus_q_read <- textstat_readability(ks_corpus_q, measure = 'Flesch.Kincaid')

ks_corpus_q_read_clean <- ks_corpus_q_read %>%
  mutate(fkgl = Flesch.Kincaid) %>%
  select(!Flesch.Kincaid) %>%
  mutate(fkgl_int = as.integer(fkgl + 0.5)) %>%
  mutate(fkgl_cat = cut(fkgl_int, breaks = c(-3, 1, 5, 9, 13, 17, Inf), ordered_result = TRUE, labels = paste("Level", 1:6, sep = " "))) %>%
  mutate(fkgl_cat = case_when(
    fkgl_cat == "Level 1" ~ "No School",
    fkgl_cat == "Level 2" ~ "Elementary School",
    fkgl_cat == "Level 3" ~ "Middle School",
    fkgl_cat == "Level 4" ~ "High School",
    fkgl_cat == "Level 5" ~ "College",
    fkgl_cat == "Level 6" ~ "Graduate School")) %>% 
  mutate(fkgl_cat = ordered(fkgl_cat, levels = c("No School", "Elementary School", "Middle School", "High School", "College", 
                                                "Graduate School"))) %>%
  mutate(fkgl_cat_2 = cut(fkgl_int, breaks = c(-3, 1, 7, 13, 19, Inf), ordered_result = TRUE, labels = paste("Level", 1:5, sep = " "))) %>%
  mutate(fkgl_cat_2 = case_when(
    fkgl_cat_2 == "Level 1" ~ "No School",
    fkgl_cat_2 == "Level 2" ~ "1-6 Years",
    fkgl_cat_2 == "Level 3" ~ "7-12 Years",
    fkgl_cat_2 == "Level 4" ~ "13-18 Years",
    fkgl_cat_2 == "Level 5" ~ "19+ Years")) %>% 
  mutate(fkgl_cat_2 = ordered(fkgl_cat_2, levels = c("No School", "1-6 Years", "1-8 Years", "7-12 Years", "13-18 Years", "19+ Years")))
```

```{r, warning = FALSE}
ks_corpus_q_df <- data_frame(
  document = ks_meta$doc_id,
  backers_count = ks_meta$backers_count,
  goal = ks_meta$goal,
  pledged = ks_meta$pledged,
  state = ks_meta$state,
  top_category = ks_meta$top_category,
  top_category_2 = ks_meta$top_category_2,
  achievement_ratio = ks_meta$achievement_ratio,
  days_to_change_state = ks_meta$days_to_change_state,
  funding_days_pct = ks_meta$funding_days_pct,
  words = ntoken(ks_corpus_q))
```

```{r}
ks_corpus_q_df_clean <- ks_corpus_q_df %>%
  mutate(document = as.character(document), 
         backers_count = as.numeric(backers_count),
         goal = as.numeric(goal),
         pledged = as.numeric(pledged),
         achievement_ratio = as.numeric(achievement_ratio),
         days_to_change_state = as.numeric(days_to_change_state),
         funding_days_pct = as.numeric(funding_days_pct),
         words = as.numeric(words)) %>%
  mutate(top_category_2 = factor(top_category_2, levels = c("music", "film & video", "art", "publishing", "technology", "food", "other")))
```

```{r}
ks_corpus_fkgl <- inner_join(ks_corpus_q_df_clean, ks_corpus_q_read_clean, by = "document")
```

```{r, warning = FALSE}
ggplot(data = subset(ks_corpus_fkgl, state == "successful"), 
       aes(x = fkgl_cat, y = backers_count, fill = fkgl_cat)) +
  geom_violin(color = "#073642", size = 0.3) +
  theme_solar_pretty() +
  scale_fill_solarized(accent = 'yellow') +
  theme(legend.position = "none", panel.grid.major.x = element_blank()) +
  scale_x_discrete(labels = c("No School", "Elementary\nSchool", "Middle\nSchool", 
                              "High School", "College", "Graduate\nSchool")) +
  scale_y_continuous(labels = comma, limits = c(0,10000)) +
  xlab("\nFlesch-Kinkaid readability score") +
  ylab(NULL) +
  labs(title = "Successful campaign popularity by reading level", 
       subtitle = "\nElementary- to middle-school reading levels are the most successful and popular\n", 
       caption = "Source: webrobots.io")
```

I have limited the data to campaigns with fewer than 10,000 backers. Campaigns with reading levels between elementary school and college are more likely to get above 5,000 backers, but a large proportion of these campaigns garner less than 1,000 backers. This pattenr is likely due to the small smaple size of campaigns with very low and very high scores. 

```{r, warning = FALSE}
state_med_fkgl <- ddply(ks_corpus_fkgl, "state", summarise, med = median(fkgl))

ggplot(ks_corpus_fkgl, aes(x = fkgl, color = state)) + 
  geom_density(size = .75) +
  geom_vline(data = state_med_fkgl, aes(xintercept = med, color = state), linetype = "dashed", size = .75) +
  theme_solar_pretty() +
  scale_colour_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  scale_x_continuous(breaks = c(-3, 0, 3, 6, 9, 12, 15, 18, 21), limits = c(-3, 23)) +
  scale_y_continuous(breaks = c(0, .03, .06, .09)) +
  xlab("\nFlesch-Kinkaid readability score") +
  ylab("Density\n") +
  labs(title = "Successful vs. failed campaigns by reading level",
       subtitle = "\nCampaigns succeed most often with a score between 6 and 10\n",
       caption = "Source: webrobots.io")
```

The sweet spot for success appears to be a reading level between 6th and 10th grade. Within this boundary, successful campaigns have the greatest density. Slightly outside this boundary on both sides, failed campaigns have the greatest density. Also, the median readability score for successful campaigns is only slightly lower than the median score for failed campaigns.


## 3. Sentiment

*Now, let’s check whether the use of positive/negative words or specific emotions helps a project to be successful.*

### a) Stay positive

*Calculate the tone of each text based on the positive and negative words that are being used. You can rely on the Hu & Liu dictionary provided in lecture or use the Bing dictionary contained in the tidytext package (tidytext::sentiments). Visualize the relationship between tone of the document and success. Briefly comment.*

```{r}
bing_dict <- as.dictionary(get_sentiments("bing"))

bing_dict_ks_dtm <- dfm(ks_corpus_q, dictionary = bing_dict)

bing_dict_ks_df <- melt(as.matrix(bing_dict_ks_dtm)) %>%
  spread(key = features, value = value) %>%
  mutate(document = as.numeric(docs)) %>%
  select(!docs) %>%
  mutate(pos_score = as.integer(positive-negative)) %>%
  mutate(pos_score_cat = as.factor(pos_score)) %>%
  mutate(pos_ratio = (positive+1)/(negative+1)) 
  
ks_corpus_bing <- inner_join(ks_corpus_q_df, bing_dict_ks_df, by = "document")
```

```{r, warning = FALSE}
ggplot(data = ks_corpus_bing, aes(x = pos_score, y = backers_count, color = state)) +
  geom_jitter() +
  xlim(-3.5, 4.5) +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.position = "none", panel.grid.major.x = element_blank()) +
  scale_y_continuous(labels = comma, limits = c(0, 10000)) +
  xlab("\nBing positivity score") +
  ylab("Number of backers\n") +
  labs(title = "Campaign popularity by positivity", 
       subtitle = "\nNeutral and slightly positive campaigns are the most popular\n", 
       caption = "Source: webrobots.io")
```

I have limited the data to campaigns with fewer than 10,000 backers. Campaigns with positivity scores between 0 and 2 are the most likely to have more than 5,000 backers.

```{r, warning = FALSE}
ggplot(data = ks_corpus_bing, aes(x = pos_score, color = state)) +
  geom_density(size = .75) +
  xlim(-4, 6) +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  scale_y_continuous(breaks = c(0, .2, .4, .6, .8)) +
  xlab("\nBing positivity score") +
  ylab("Density\n") +
  labs(title = "Successful vs. failed campaigns by positivity",
       subtitle = "\nSlightly negative campaigns have the highest rate of success\n",
       caption = "Source: webrobots.io")
```

Campaigns on the extremes of the positivity scale (very positive or negative) have a higher success rate than campaigns with neutral or slightly positive scores. This suggests that campaign blurbs with emotive language do better than those with a more flat affect.

### b) Positive vs. negative

*Segregate all 2,000 blurbs into positive and negative texts based on their polarity score calculated in step (a). Now, collapse the positive and negative texts into two larger documents. Create a document-term-matrix based on this collapsed set of two documents. Generate a comparison cloud showing the most-frequent positive and negative words.*

```{r}
ks_corpus_bing <- ks_corpus_bing %>%
  mutate(sent_cat = case_when(
    pos_ratio >= 1 ~ "positive",
    TRUE           ~ "negative"))

ks_docs_pos <- ks_corpus_bing %>%
  filter(sent_cat == "positive") %>%
  mutate(doc_id = document) %>%
  select(doc_id)

ks_docs_neg <- ks_corpus_bing %>%
  filter(sent_cat == "negative") %>%
  mutate(doc_id = document) %>%
  select(doc_id)
```

```{r, message = FALSE}
ks_pos <- inner_join(ks_docs_pos, ks_top_bottom, key = doc_id)
ks_neg <- inner_join(ks_docs_neg, ks_top_bottom, key = doc_id)

ks_ds_pos <- DataframeSource(ks_pos)
ks_corpus_pos <- VCorpus(ks_ds_pos)

ks_ds_neg <- DataframeSource(ks_neg)
ks_corpus_neg <- VCorpus(ks_ds_neg)
```

```{r, message = FALSE}
ks_corpus_pos_clean <- clean_corpus(ks_corpus_pos)
ks_corpus_neg_clean <- clean_corpus(ks_corpus_neg)

ks_corpus_pos_clean_stemmed <- tm_map(ks_corpus_pos_clean, stemDocument)
ks_corpus_neg_clean_stemmed <- tm_map(ks_corpus_neg_clean, stemDocument)
```

```{r}
no_cores <- detectCores() - 1

ks_corpus_pos_final <- mclapply(ks_corpus_pos_clean_stemmed, stemCompletionBrambor, dictionary = ks_corpus_pos_clean, mc.cores = no_cores)
ks_corpus_neg_final <- mclapply(ks_corpus_neg_clean_stemmed, stemCompletionBrambor, dictionary = ks_corpus_neg_clean, mc.cores = no_cores)
```

```{r}
ks_corpus_pos_final <- as.VCorpus(ks_corpus_pos_final)
ks_corpus_neg_final <- as.VCorpus(ks_corpus_neg_final)
```

```{r}
ks_tdm_pos <- TermDocumentMatrix(ks_corpus_pos_final)
ks_tdm_pos_tidy <- tidytext::tidy(ks_tdm_pos)

ks_tdm_neg <- TermDocumentMatrix(ks_corpus_neg_final)
ks_tdm_neg_tidy <- tidytext::tidy(ks_tdm_neg)
```

```{r, warning = FALSE}
ks_tf_idf_pos <- ks_tdm_pos_tidy %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf)) 

wordcloud(ks_tf_idf_pos$term, ks_tf_idf_pos$tf, max.words = 100, colors = "#b58900")
```

```{r, warning = FALSE}
ks_tf_idf_neg <- ks_tdm_neg_tidy %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf)) 

wordcloud(ks_tf_idf_neg$term, ks_tf_idf_neg$tf, max.words = 100, colors = "#268bd2")
```

### c) Get in their mind

*Now, use the NRC Word-Emotion Association Lexicon in the tidytext package to identify a larger set of emotions (anger, anticipation, disgust, fear, joy, sadness, surprise, trust). Again, visualize the relationship between the use of words from these categories and success. What is your finding?*

```{r}
nrc_dict <- as.dictionary(get_sentiments("nrc"))

nrc_dict_ks_dtm <- dfm(ks_corpus_q, dictionary = nrc_dict)

nrc_dict_ks_df <- melt(as.matrix(nrc_dict_ks_dtm)) %>%
  spread(key = features, value = value) %>%
  mutate(document = as.numeric(docs)) %>%
  select(!docs) %>%
  mutate(sent_score = anger + anticipation + disgust + fear + joy + sadness + surprise + trust)

ks_corpus_nrc <- inner_join(ks_corpus_q_df, nrc_dict_ks_df, by = "document")
```

```{r, warning = FALSE}
ggplot(data = ks_corpus_nrc, aes(x = anticipation, color = state)) +
  geom_density(size = .75) +
  xlim(0,4) +
  ylim(0,1.6) +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  xlab("\nNRC anticipation score") +
  ylab("Density\n") +
  labs(title = "Successful vs. failed campaigns by anticipation",
       caption = "Source: webrobots.io")
```

```{r, warning = FALSE}
ggplot(data = ks_corpus_nrc, aes(x = joy, color = state)) +
  geom_density(size = .75) +
  xlim(0,4) +
  ylim(0,1.6) +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  xlab("\nNRC joy score") +
  ylab("Density\n") +
  labs(title = "Successful vs. failed campaigns by joy",
       caption = "Source: webrobots.io")
```

```{r, warning = FALSE}
ggplot(data = ks_corpus_nrc, aes(x = trust, color = state)) +
  geom_density(size = .75) +
  xlim(0,4) +
  ylim(0,1.6) +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  xlab("\nNRC trust score") +
  ylab("Density\n") +
  labs(title = "Successful vs. failed campaigns by trust",
       caption = "Source: webrobots.io")
```

I limited the data to campaigns with emotion scores less than 4. I looked at anticipation, joy, and trust, which I consider the most likely to inspire support for Kickstarter campaigns. However, for each emotion, campaigns with low scores are more likely to succeed, whereas those with higher scores were more likely to fail.

```{r, warning = FALSE}
ggplot(data = ks_corpus_nrc, aes(x = sent_score, color = state)) +
  geom_density(size = .75) +
  xlim(0,20) +
  theme_solar_pretty() +
  scale_color_solarized(accent = 'yellow') +
  theme(legend.title = element_blank(), legend.position = "top") +
  xlab("\nNRC total sentiment score") +
  ylab("Density\n") +
  labs(title = "Successful vs. failed campaigns by all emotions",
       subtitle = "\n...\n",
       caption = "Source: webrobots.io")
```

I then combined the scores for all emotions into a single "emotive" score. Campaigns with lower scores for any emotion are the most likely to be successful. The sweet spot appears to be scores between 0 and 3.
