library(dplyr)
library(ggplot2)
library(hunspell)
library(readr)
library(stringr)
library(tidyr)
library(tidytext)
library(wordcloud2)
# Question summary table.
questions <- read_csv("data/question_summary.csv")

# Raw survey responses (first header row removed). The file is parsed as
# tab-separated even though it carries a .csv extension.
responses_raw <- read_tsv("data/responses.csv")

# Drop rows where Q2 was never shown: they hold no responses and may have been
# errors. Rows where Q2 is NA are dropped by filter() as well.
responses <- filter(responses_raw, Q2 != "{question_not_shown}")

# Keep rows where Q27 was shown and answered with more than three characters,
# then narrow to the id column and the question columns selected below.
responses_rcons <- responses %>%
  filter(
    Q27 != "{question_not_shown}",
    str_length(Q27) > 3
  ) %>%
  select(id, Q27, Q28, Q29_SQ01, Q29_SQ02, Q29_SQ03, Q30, Q31, Q38)
# Example pipeline for a single question's data frame `q` (kept for reference):
# q_words <- q %>%
#   mutate(text = str_replace_all(Q27, "\\{.*?\\}", " ")) %>%
#   unnest_tokens(word, text)
#
# q_words <- fix_spelling(q_words)
#
# q_words_top <- top_q_tf_idf(q_words)
# Find misspelled tokens and propose one replacement for each.
#
# The distinct tokens in `q_words` are checked against the en_US, en_GB and
# es_ES hunspell dictionaries. A token counts as correctly spelled when any
# dictionary accepts it, when it is one of the whitelisted technical terms, or
# when it contains one of the whitelisted fragments. hunspell suggestions are
# looked up for the remaining tokens.
#
# Args:
#   q_words: data frame with a character column `word` (one token per row).
#
# Returns:
#   A tibble with columns `word` and `suggestion`, one row per misspelled token
#   that has at least one suggestion. Tokens without any suggestion are dropped
#   by unnest().
check_spelling <- function(q_words) {
  # Technical terms that the dictionaries reject but that are spelled correctly.
  known_terms <- c(
    "cran", "alteryx", "rmarkdown", "repl", "api",
    "barug", "bioconductor", "walkthroughs",
    "composable", "csv", "sig", "dplyr",
    "foss", "eda", "geo", "ess", "e.g.",
    "lme4", "emacs", "github", "qgis",
    "grdevices", "adhoc", "hpc", "httr",
    "ui", "i.e.", "jupyter", "knitr", "rladies",
    "listserv", "magrittr", "matlab", "mathematica",
    "mran", "nlp", "mysql", "pandoc", "splus", "mplus",
    "postgres", "postgis", "powerbi", "ropensci",
    "purrr", "rcpp", "read_csv", "rmd", "rstan", "spss",
    "stata", "rstats", "stan", "rstudio", "etl", "testthat",
    "tidyr", "tidyverse", "sweave", "stackoverflow"
  )
  # Any token containing one of these fragments is accepted as well.
  known_fragments <- "data|df|app|bloggers|hadley|dplyr|ggplot|odbc|wickham"

  # Load every dictionary once, up front, rather than inside mutate().
  dict_en_us <- dictionary("en_US")
  dict_en_gb <- dictionary("en_GB")
  dict_es_es <- dictionary("es_ES")

  spellcheck <- q_words %>%
    ungroup() %>%
    distinct(word) %>%
    mutate(
      en_us = hunspell_check(word, dict = dict_en_us),
      en_gb = hunspell_check(word, dict = dict_en_gb),
      es_es = hunspell_check(word, dict = dict_es_es),
      r = word %in% known_terms | str_detect(word, known_fragments)
    )

  spellcheck %>%
    filter(!(en_gb | en_us | es_es | r)) %>%
    # hunspell_suggest() returns a list with one character vector per token.
    mutate(suggestions = hunspell_suggest(word)) %>%
    unnest(suggestions) %>%
    group_by(word) %>%
    # Just take the first suggestion for now.
    summarise(suggestion = first(suggestions))
}
# Replace misspelled tokens with their suggested spelling and count tokens
# per respondent.
#
# Args:
#   q_words: data frame with columns `id` and `word` (one token per row).
#
# Returns:
#   A tibble with columns `id`, `word` and `n` (rows per id/word pair after
#   correction), sorted by descending `n`. All other input columns are dropped
#   by count().
fix_spelling <- function(q_words) {
  corrections <- check_spelling(q_words)

  q_words %>%
    # Name the join key explicitly so it does not depend on which column names
    # the two tables happen to share (and no "Joining with..." message is
    # emitted).
    left_join(corrections, by = "word") %>%
    # Tokens without a correction keep their original spelling.
    mutate(word = ifelse(is.na(suggestion), word, suggestion)) %>%
    count(id, word, sort = TRUE) %>%
    ungroup()
}
# Replace each token with its first hunspell stem.
#
# Args:
#   q_words: data frame with a character column `word` (one token per row).
#
# Returns:
#   `q_words` with the same rows in the same order plus two columns: `stem`
#   (first stem returned by hunspell_stem(), NA when there is none) and
#   `word_old` (the token before stemming). `word` holds the stem where one
#   exists and the original token otherwise.
stem_q_words <- function(q_words) {
  # Stem each distinct token once. unnest() drops tokens with no stems, so
  # they come back as NA after the join below.
  stems <- q_words %>%
    ungroup() %>%
    distinct(word) %>%
    mutate(stem = hunspell_stem(word)) %>%
    unnest(stem) %>%
    group_by(word) %>%
    summarise(stem = first(stem))

  q_words %>%
    # Explicit key: join on the token only.
    left_join(stems, by = "word") %>%
    mutate(
      word_old = word,
      word = ifelse(is.na(stem), word, stem)
    )
}
# Score every (respondent, word) pair with tf-idf, treating each respondent
# `id` as one document.
#
# Args:
#   q_words: data frame with columns `id`, `word` and `n` (count of `word` for
#     that `id`), e.g. the result of fix_spelling().
#
# Returns:
#   `q_words` with the `tf`, `idf` and `tf_idf` columns added by
#   bind_tf_idf(), sorted by descending `tf_idf`.
q_tf_idf <- function(q_words) {
  # The previous version built a per-id `total` column, then discarded the
  # result of the pipeline that dropped it again and sorted the rows, so callers
  # received unsorted data that still carried `total`. bind_tf_idf() derives
  # the per-document totals itself, so no helper column is needed at all.
  q_words %>%
    bind_tf_idf(word, id, n) %>%
    arrange(desc(tf_idf))
}
# Count, per word, how many rows reach a tf-idf threshold.
#
# Args:
#   q: data frame accepted by q_tf_idf() (columns `id`, `word`, `n`).
#   tf_idf_value: rows with `tf_idf` below this value are ignored.
#   n_value: only words with strictly more than this many remaining rows are
#     returned.
#
# Returns:
#   A tibble with columns `word` and `n` (number of rows for that word whose
#   `tf_idf` is at least `tf_idf_value`).
top_q_tf_idf <- function(q, tf_idf_value = 3, n_value = 1) {
  scored <- q_tf_idf(q)
  high_scoring <- filter(scored, tf_idf >= tf_idf_value)
  rows_per_word <- summarise(group_by(high_scoring, word), n = n())
  filter(rows_per_word, n > n_value)
}
# Word frequencies with stop words filtered out.
#
# Args:
#   q_words: data frame with a `word` column.
#
# Returns:
#   A tibble with columns `word` and `n`, where `n` is the number of input rows
#   for that word. Words listed in tidytext's `stop_words` are excluded.
#
# NOTE: rows are counted and any existing `n` column is ignored. When the input
# has one row per (id, word) pair, as fix_spelling() returns, `n` is therefore
# the number of ids using the word rather than the total number of occurrences.
q_word_frequency <- function(q_words) {
  content_words <- filter(q_words, !(word %in% stop_words$word))
  by_word <- group_by(content_words, word)
  summarise(by_word, n = n())
}
# ---- Q27 (best_of_r) ----

# Tokenise the Q27 answers, one row per word. "{...}" placeholders are blanked
# out before tokenising.
best_of_r <- responses_rcons %>%
  select(id, Q27) %>%
  mutate(text = str_replace_all(Q27, "\\{.*?\\}", " ")) %>%
  unnest_tokens(word, text)

# Stem the tokens first, then spell-correct and count them per respondent.
best_of_r_stems <- stem_q_words(best_of_r)
best_of_r_words <- fix_spelling(best_of_r_stems)

best_of_r_words_top <- top_q_tf_idf(best_of_r_words)
best_of_r_freq <- q_word_frequency(best_of_r_words)
# Rank explicitly by the count column `n` instead of relying on top_n()'s
# implicit "last column" default; ties at the cut-off are kept.
best_of_r_freq_top <- best_of_r_freq %>% top_n(25, n)

# Horizontal bar charts, bars ordered by count.
ggplot(best_of_r_words_top, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip()
ggplot(best_of_r_freq_top, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip()

# Word clouds; the columns are renamed to "word" and "freq" for wordcloud2().
best_of_r_wcdata <- best_of_r_words_top
names(best_of_r_wcdata) <- c("word", "freq")
wordcloud2(best_of_r_wcdata)
best_of_r_freq_wcdata <- best_of_r_freq_top
names(best_of_r_freq_wcdata) <- c("word", "freq")
wordcloud2(best_of_r_freq_wcdata)
# ---- Q28 (worst_of_r) ----

# Tokenise the Q28 answers, one row per word. "{...}" placeholders are blanked
# out before tokenising.
worst_of_r <- responses_rcons %>%
  select(id, Q28) %>%
  mutate(text = str_replace_all(Q28, "\\{.*?\\}", " ")) %>%
  unnest_tokens(word, text)

# Spell-correct and count tokens per respondent (stemming is disabled here).
# worst_of_r_stems <- stem_q_words(worst_of_r)
worst_of_r_words <- fix_spelling(worst_of_r)

# worst_of_r_words_tfidf <- q_tf_idf(worst_of_r_words)
worst_of_r_words_top <- top_q_tf_idf(
  worst_of_r_words,
  tf_idf_value = 4,
  n_value = 0
)
worst_of_r_freq <- q_word_frequency(worst_of_r_words)
# Rank explicitly by the count column `n` instead of relying on top_n()'s
# implicit "last column" default; ties at the cut-off are kept.
worst_of_r_freq_top <- worst_of_r_freq %>% top_n(25, n)

# Horizontal bar charts, bars ordered by count.
ggplot(worst_of_r_words_top, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip()
ggplot(worst_of_r_freq_top, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip()

# Word clouds; the columns are renamed to "word" and "freq" for wordcloud2().
worst_of_r_wcdata <- worst_of_r_words_top
names(worst_of_r_wcdata) <- c("word", "freq")
wordcloud2(worst_of_r_wcdata)
worst_of_r_freq_wcdata <- worst_of_r_freq_top
names(worst_of_r_freq_wcdata) <- c("word", "freq")
wordcloud2(worst_of_r_freq_wcdata)
# ---- Q30 (best_of_rcons) ----

# Tokenise the Q30 answers, one row per word. "{...}" placeholders are blanked
# out before tokenising.
best_of_rcons <- responses_rcons %>%
  select(id, Q30) %>%
  mutate(text = str_replace_all(Q30, "\\{.*?\\}", " ")) %>%
  unnest_tokens(word, text)

# Spell-correct and count tokens per respondent (stemming is disabled here).
best_of_rcons_words <- fix_spelling(best_of_rcons)
# best_of_rcons_stems <- stem_q_words(best_of_rcons)

best_of_rcons_words_tfidf <- q_tf_idf(best_of_rcons_words)
best_of_rcons_words_top <- top_q_tf_idf(
  best_of_rcons_words,
  tf_idf_value = 2.5,
  n_value = 0
)
best_of_rcons_freq <- q_word_frequency(best_of_rcons_words)
# Rank explicitly by the count column `n` instead of relying on top_n()'s
# implicit "last column" default; ties at the cut-off are kept.
best_of_rcons_freq_top <- best_of_rcons_freq %>% top_n(25, n)

# Horizontal bar charts, bars ordered by count.
ggplot(best_of_rcons_words_top, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip()
ggplot(best_of_rcons_freq_top, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip()

# Word clouds; the columns are renamed to "word" and "freq" for wordcloud2().
best_of_rcons_wcdata <- best_of_rcons_words_top
names(best_of_rcons_wcdata) <- c("word", "freq")
wordcloud2(best_of_rcons_wcdata)
best_of_rcons_freq_wcdata <- best_of_rcons_freq_top
names(best_of_rcons_freq_wcdata) <- c("word", "freq")
wordcloud2(best_of_rcons_freq_wcdata)
# ---- Q31 (worst_of_rcons) ----

# Tokenise the Q31 answers, one row per word. "{...}" placeholders are blanked
# out before tokenising.
worst_of_rcons <- responses_rcons %>%
  select(id, Q31) %>%
  mutate(text = str_replace_all(Q31, "\\{.*?\\}", " ")) %>%
  unnest_tokens(word, text)

# Spell-correct and count tokens per respondent (stemming is disabled here).
worst_of_rcons_words <- fix_spelling(worst_of_rcons)
# worst_of_rcons_stems <- stem_q_words(worst_of_rcons)

worst_of_rcons_words_tfidf <- q_tf_idf(worst_of_rcons_words)
worst_of_rcons_words_top <- top_q_tf_idf(
  worst_of_rcons_words,
  tf_idf_value = 2.5,
  n_value = 0
)
worst_of_rcons_freq <- q_word_frequency(worst_of_rcons_words)
# Rank explicitly by the count column `n` instead of relying on top_n()'s
# implicit "last column" default; ties at the cut-off are kept.
worst_of_rcons_freq_top <- worst_of_rcons_freq %>% top_n(25, n)

# Horizontal bar charts, bars ordered by count.
ggplot(worst_of_rcons_words_top, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip()
ggplot(worst_of_rcons_freq_top, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip()

# Word clouds; the columns are renamed to "word" and "freq" for wordcloud2().
worst_of_rcons_wcdata <- worst_of_rcons_words_top
names(worst_of_rcons_wcdata) <- c("word", "freq")
wordcloud2(worst_of_rcons_wcdata)
worst_of_rcons_freq_wcdata <- worst_of_rcons_freq_top
names(worst_of_rcons_freq_wcdata) <- c("word", "freq")
wordcloud2(worst_of_rcons_freq_wcdata)
# ---- Q38 (anything_else) ----

# Tokenise the Q38 answers, one row per word. "{...}" placeholders are blanked
# out before tokenising.
anything_else <- responses_rcons %>%
  select(id, Q38) %>%
  mutate(text = str_replace_all(Q38, "\\{.*?\\}", " ")) %>%
  unnest_tokens(word, text)

# Spell-correct and count tokens per respondent (stemming is disabled here).
anything_else_words <- fix_spelling(anything_else)
# anything_else_stems <- stem_q_words(anything_else)

anything_else_words_tfidf <- q_tf_idf(anything_else_words)
anything_else_words_top <- top_q_tf_idf(
  anything_else_words,
  tf_idf_value = 2.5,
  n_value = 1
)
anything_else_freq <- q_word_frequency(anything_else_words)
# Rank explicitly by the count column `n` instead of relying on top_n()'s
# implicit "last column" default; ties at the cut-off are kept.
anything_else_freq_top <- anything_else_freq %>% top_n(25, n)

# Horizontal bar charts, bars ordered by count.
ggplot(anything_else_words_top, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip()
ggplot(anything_else_freq_top, aes(x = reorder(word, n), y = n)) +
  geom_col(show.legend = FALSE) +
  coord_flip()

# Word clouds; the columns are renamed to "word" and "freq" for wordcloud2().
anything_else_wcdata <- anything_else_words_top
names(anything_else_wcdata) <- c("word", "freq")
wordcloud2(anything_else_wcdata)
anything_else_freq_wcdata <- anything_else_freq_top
names(anything_else_freq_wcdata) <- c("word", "freq")
wordcloud2(anything_else_freq_wcdata)