# GitHub repo: https://github.com/rladies-pdx/tidytext_meetups
library(dplyr)
library(gutenbergr)
library(tidytext)
library(ggplot2)
library(stringr)
library(tidyverse)
library(readr)
library(wordcloud)
# Load the exported meetup events.
# The file is addressed through an explicit path instead of setwd(), so the
# session's working directory is not changed as a side effect of running
# this script.
data_dir <- path.expand("~/dev/Rladies/tidytext_meetups/data")
meetups <- read_csv(file.path(data_dir, "rladies_meetup_events_past.csv"))
# Build one cleaned description per event.
#   line        - row position within `meetups`, assigned before rows with a
#                 missing description are dropped.
#   description - markup removed, then reduced to letters and whitespace.
# Tags are replaced by a space (not deleted) and all whitespace is kept, so
# words separated only by a tag or a line break stay separate tokens.
# Punctuation and digits are still deleted outright, as before.
event_descriptions <- meetups %>%
  select(id, description, group.localized_location) %>%
  mutate(
    line = row_number(),
    description = str_replace_all(description, "<[^>]*>", " "),
    description = str_replace_all(description, "[^[:alpha:][:space:]]", "")
  ) %>%
  filter(!is.na(description))
# One row per (event, word).
event_descr <- event_descriptions %>%
  unnest_tokens(word, description)

# count words
event_descr_cnt <- event_descr %>%
  count(word, sort = TRUE)

# remove common words using an anti_join; the key is spelled out so the join
# does not depend on whichever column names the two tables happen to share
event_descr_cnt_stop <- event_descr %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)
# The 40 most frequent non-stop-words (ties at the cut-off are kept by
# top_n()), with `word` turned into a factor ordered by count for plotting.
# Join key and ranking column are explicit, so the result does not depend on
# column order and no "Joining, by" / "Selecting by" messages are emitted.
event_descr_top <- event_descr %>%
  anti_join(stop_words, by = "word") %>%
  # drop tokens containing a straight or a curly apostrophe
  filter(!str_detect(word, "['’]")) %>%
  count(word, sort = TRUE) %>%
  top_n(40, n) %>%
  mutate(word = reorder(word, n))
# Horizontal bar chart of the top words; bar length is the word count.
event_descr_top %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip()
# Word cloud of the top words, restricted to those occurring at least 20 times.
# NOTE(review): `colors` receives the count vector rather than colour names;
# confirm that this is the intended colouring.
wordcloud(
  words = event_descr_top$word,
  freq = event_descr_top$n,
  scale = c(5, .8),
  colors = event_descr_top$n,
  min.freq = 20
)
# Words that appear in the "bing" sentiment lexicon, counted per sentiment;
# keeps the 20 most frequent words of each sentiment (ties included).
# Join key and ranking column are explicit so the result does not depend on
# shared column names or on column order.
event_descr_bow <- event_descr %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(20, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))
# One panel per sentiment, each with its own axis ranges; bars show counts.
event_descr_bow %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free") +
  coord_flip()
# Word counts per meetup location: the (term, document, count) table that
# bind_tf_idf() takes as input.
event_descr_all <- count(
  unnest_tokens(event_descriptions, word, description),
  group.localized_location, word,
  sort = TRUE
)

# tf-idf of each word within its location, highest scores first.
event_descr_tfdf <- event_descr_all %>%
  filter(!str_detect(word, "'")) %>%
  bind_tf_idf(word, group.localized_location, n) %>%
  arrange(desc(tf_idf))

event_descr_tfdf
## # A tibble: 27,000 x 6
## group.localized_location word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Taipei, Taiwan çš„ 245 0.04779555 3.931826 0.18792378
## 2 Izmir, Turkey bir 5 0.04098361 3.238678 0.13273272
## 3 Ames, IA audrey 148 0.03355248 3.931826 0.13192251
## 4 Budapest, Hungary az 83 0.03340040 3.931826 0.13132456
## 5 Izmir, Turkey ve 5 0.04098361 2.833213 0.11611530
## 6 Rio de Janeiro, Brazil em 32 0.03892944 2.833213 0.11029541
## 7 Montevideo, Uruguay de 53 0.10153257 1.041454 0.10574149
## 8 Istanbul, Turkey ve 57 0.03632887 2.833213 0.10292744
## 9 Montevideo, Uruguay en 33 0.06321839 1.533930 0.09697261
## 10 Warsaw, Poland w 313 0.03401065 2.833213 0.09635942
## # ... with 26,990 more rows
# The 50 highest tf-idf (location, word) pairs, one panel per location.
# Rows are ranked by tf_idf explicitly (rather than by whichever column is
# last); bar length is the raw count `n`.
ggplot(
  event_descr_tfdf %>% top_n(50, tf_idf),
  aes(x = word, y = n, fill = group.localized_location)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ group.localized_location, scales = "free") +
  coord_flip()
# Restrict to locations that end in ", XX" with XX two capital letters
# (e.g. "Ames, IA" -- presumably state-style codes; confirm against the
# data), then keep the 50 highest tf-idf rows. The pattern requires letters
# instead of "any two non-digit characters", and the subset is computed once
# and shared by both plots below.
state_tfidf_top <- event_descr_tfdf %>%
  filter(str_detect(group.localized_location, ", [A-Z]{2}$")) %>%
  top_n(50, tf_idf)

# One panel per location.
ggplot(
  state_tfidf_top,
  aes(x = word, y = n, fill = group.localized_location)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ group.localized_location, scales = "free") +
  coord_flip()

# Single panel, words ordered by count, coloured by location.
ggplot(
  state_tfidf_top,
  aes(x = reorder(word, n), y = n, fill = group.localized_location)
) +
  geom_col() +
  coord_flip()
# Bigram counts across all event descriptions, excluding any bigram in which
# either word is a stop word; most frequent first.
tidy_ngram <- event_descriptions %>%
  unnest_tokens(bigram, description, token = "ngrams", n = 2) %>%
  # descriptions with fewer than two words can yield an NA bigram; drop those
  # so they are not counted as an (NA, NA) pair
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)

tidy_ngram
## # A tibble: 22,764 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 data science 77
## 2 machine learning 73
## 3 pm pm 55
## 4 data scientist 54
## 5 de la 51
## 6 rladies ames 39
## 7 data management 38
## 8 analysis issues 37
## 9 audrey mccombs 37
## 10 coding practices 37
## # ... with 22,754 more rows
# The 20 most frequent bigrams plus a "word1 word2" label for plotting.
# top_n() keeps ties at the cut-off, so more than 20 rows can come back.
# The ranking column is named explicitly instead of defaulting to the last
# column of the table.
tidy_ngram_top <- tidy_ngram %>%
  top_n(20, n) %>%
  mutate(combined = paste(word1, word2))

tidy_ngram_top
## # A tibble: 22 x 4
## word1 word2 n combined
## <chr> <chr> <int> <chr>
## 1 data science 77 data science
## 2 machine learning 73 machine learning
## 3 pm pm 55 pm pm
## 4 data scientist 54 data scientist
## 5 de la 51 de la
## 6 rladies ames 39 rladies ames
## 7 data management 38 data management
## 8 analysis issues 37 analysis issues
## 9 audrey mccombs 37 audrey mccombs
## 10 coding practices 37 coding practices
## # ... with 12 more rows
# Top bigrams as horizontal bars ordered by count, coloured by first word.
tidy_ngram_top %>%
  ggplot(aes(x = reorder(combined, n), y = n, fill = word1)) +
  geom_col(show.legend = FALSE) +
  coord_flip()
# words with data: bigrams whose first word is "data", excluding second words
# that are stop words or contain an apostrophe; most frequent first
data_bigram <- event_descriptions %>%
  unnest_tokens(bigram, description, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(word1 == "data") %>%
  filter(!word2 %in% stop_words$word, !str_detect(word2, "'")) %>%
  count(word1, word2, sort = TRUE)

# The 25 most frequent followers per first word (ties kept). The ranking
# column is explicit and the grouping is removed afterwards so it does not
# leak into later operations on this table.
data_bigram_top <- data_bigram %>%
  group_by(word1) %>%
  top_n(25, n) %>%
  ungroup()
# Second words of the selected bigrams, ordered by count, one panel per
# first word.
data_bigram_top %>%
  ggplot(aes(x = reorder(word2, n), y = n, fill = word1)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ word1, scales = "free")
# Word cloud of the second words, restricted to those seen at least 5 times.
# NOTE(review): `colors` receives the count vector rather than colour names;
# confirm that this is the intended colouring.
wordcloud(
  words = data_bigram_top$word2,
  freq = data_bigram_top$n,
  scale = c(5, 1),
  colors = data_bigram_top$n,
  min.freq = 5
)