Tidytexting R-Ladies Meetups

GitHub repo: https://github.com/rladies-pdx/tidytext_meetups

library(dplyr)
library(gutenbergr)
library(tidytext)
library(ggplot2)
library(stringr)
library(tidyverse)
library(readr)
library(wordcloud)

# Load the export of past R-Ladies meetup events.
# The data directory is held in a variable and joined with file.path() rather
# than calling setwd(), so running the script does not change the session's
# working directory as a side effect.
data_dir <- path.expand("~/dev/Rladies/tidytext_meetups/data")

meetups <- read_csv(file.path(data_dir, "rladies_meetup_events_past.csv"))

# One row per event with a plain-text description.
#   line        - row position of the event in `meetups` (assigned before
#                 events without a description are dropped)
#   description - description with HTML tags and non-letter characters removed
event_descriptions <- meetups %>%
  select(id, description, group.localized_location) %>%
  mutate(
    line = row_number(),
    # Replace tags and whitespace runs (line breaks, tabs) with a single space
    # so that neighbouring words are not glued together once the markup and
    # control characters are stripped.
    description = str_replace_all(description, "<[^>]*>", " "),
    description = str_replace_all(description, "\\s+", " "),
    # Drop everything that is neither a letter nor a space.
    description = str_replace_all(description, regex("[^[:alpha:] ]"), "")
  ) %>%
  filter(!is.na(description))

# Tokenise the descriptions: one word per row
event_descr <- event_descriptions %>%
  unnest_tokens(word, description)

# count words
event_descr_cnt <- event_descr %>%
  count(word, sort = TRUE)

# remove common words using an anti_join; the join key is spelled out so the
# match does not silently depend on whichever column names happen to be shared
event_descr_cnt_stop <- event_descr %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)

# Most frequent non-stop-words. `word` is turned into a factor whose levels
# are ordered by count, which controls the bar order in the plot below.
event_descr_top <- event_descr %>%
  anti_join(stop_words, by = "word") %>%
  # skip tokens that contain a straight or a curly apostrophe
  filter(
    !str_detect(word, "'"),
    !str_detect(word, "’")
  ) %>%
  count(word, sort = TRUE) %>%
  # rank explicitly on the count column; ties at the cut-off are all kept,
  # so the result can hold more than 40 rows
  top_n(40, n) %>%
  mutate(word = reorder(word, n))

# Horizontal bar chart of the top words and their counts
event_descr_top %>%
  ggplot(mapping = aes(x = word, y = n)) +
  geom_col() +
  coord_flip()

# Word cloud of the top words, sized by frequency.
# `colors` takes a vector of colours (used from least to most frequent), so
# the active base-graphics palette is passed rather than the count column.
wordcloud(
  words = event_descr_top$word,
  freq = event_descr_top$n,
  scale = c(5, .8),
  min.freq = 20,
  colors = palette()
)

# Most frequent words per sentiment class, using the "bing" lexicon.
event_descr_bow <- event_descr %>%
  # keep only words present in the lexicon; join key stated explicitly
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  # top 20 by count within each sentiment (ties at the cut-off are kept)
  top_n(20, n) %>%
  ungroup() %>%
  # order factor levels by count for plotting
  mutate(word = reorder(word, n))

# One panel per sentiment, bars coloured by sentiment, no legend
event_descr_bow %>%
  ggplot(mapping = aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free") +
  coord_flip()

# Word counts per meetup location; this table is the input to bind_tf_idf()
event_descr_all <- count(
  unnest_tokens(event_descriptions, word, description),
  group.localized_location,
  word,
  sort = TRUE
)

# tf-idf of each word, treating every location as one document,
# sorted from highest to lowest tf-idf
event_descr_tfdf <- event_descr_all %>%
  filter(!str_detect(word, "'")) %>%
  bind_tf_idf(term = word, document = group.localized_location, n = n) %>%
  arrange(desc(tf_idf))

event_descr_tfdf

## # A tibble: 27,000 x 6
##    group.localized_location   word     n         tf      idf     tf_idf
##                       <chr>  <chr> <int>      <dbl>    <dbl>      <dbl>
##  1           Taipei, Taiwan     的   245 0.04779555 3.931826 0.18792378
##  2            Izmir, Turkey    bir     5 0.04098361 3.238678 0.13273272
##  3                 Ames, IA audrey   148 0.03355248 3.931826 0.13192251
##  4        Budapest, Hungary     az    83 0.03340040 3.931826 0.13132456
##  5            Izmir, Turkey     ve     5 0.04098361 2.833213 0.11611530
##  6   Rio de Janeiro, Brazil     em    32 0.03892944 2.833213 0.11029541
##  7      Montevideo, Uruguay     de    53 0.10153257 1.041454 0.10574149
##  8         Istanbul, Turkey     ve    57 0.03632887 2.833213 0.10292744
##  9      Montevideo, Uruguay     en    33 0.06321839 1.533930 0.09697261
## 10           Warsaw, Poland      w   313 0.03401065 2.833213 0.09635942
## # ... with 26,990 more rows

# Terms with the highest tf-idf, one panel per meetup location.
# Rows are selected by tf_idf (stated explicitly instead of relying on
# top_n()'s "last column" default); bar length is the raw count `n`.
event_descr_tfdf %>%
  top_n(50, tf_idf) %>%
  ggplot(aes(x = word, y = n, fill = group.localized_location)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ group.localized_location, scales = "free") +
  coord_flip()

# Same chart, restricted to locations that end in ", XX" where XX is two
# non-digit characters (e.g. "Ames, IA").
# Rows are selected by tf_idf, stated explicitly instead of relying on
# top_n()'s "last column" default; bar length is the raw count `n`.
event_descr_tfdf %>%
  filter(str_detect(group.localized_location, ", \\D\\D$")) %>%
  top_n(50, tf_idf) %>%
  ggplot(aes(x = word, y = n, fill = group.localized_location)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ group.localized_location, scales = "free") +
  coord_flip()

# Single-panel chart of the highest tf-idf terms for locations that end in
# ", XX" (two non-digit characters): bars are ordered by count and the fill
# colour (with legend) identifies the location.
# Rows are selected by tf_idf, stated explicitly instead of relying on
# top_n()'s "last column" default.
event_descr_tfdf %>%
  filter(str_detect(group.localized_location, ", \\D\\D$")) %>%
  top_n(50, tf_idf) %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = group.localized_location)) +
  geom_col() +
  coord_flip()

# Split every description into two-word sequences (bigrams)
tidy_ngram <- event_descriptions %>%
  unnest_tokens(bigram, description, token = "ngrams", n = 2)

# Count bigrams in which neither word is a stop word
tidy_ngram <- tidy_ngram %>%
  # a description with fewer than two words may yield an NA bigram; NA is not
  # in the stop-word list, so without this guard it would be counted as a pair
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)

tidy_ngram

## # A tibble: 22,764 x 3
##       word1      word2     n
##       <chr>      <chr> <int>
##  1     data    science    77
##  2  machine   learning    73
##  3       pm         pm    55
##  4     data  scientist    54
##  5       de         la    51
##  6  rladies       ames    39
##  7     data management    38
##  8 analysis     issues    37
##  9   audrey    mccombs    37
## 10   coding  practices    37
## # ... with 22,754 more rows

# Most frequent bigrams, with both words pasted into one label for plotting.
# The ranking column is named explicitly; ties at the cut-off are kept, so
# more than 20 rows can be returned.
tidy_ngram_top <- tidy_ngram %>%
  top_n(20, n) %>%
  mutate(combined = paste(word1, word2))

tidy_ngram_top

## # A tibble: 22 x 4
##       word1      word2     n         combined
##       <chr>      <chr> <int>            <chr>
##  1     data    science    77     data science
##  2  machine   learning    73 machine learning
##  3       pm         pm    55            pm pm
##  4     data  scientist    54   data scientist
##  5       de         la    51            de la
##  6  rladies       ames    39     rladies ames
##  7     data management    38  data management
##  8 analysis     issues    37  analysis issues
##  9   audrey    mccombs    37   audrey mccombs
## 10   coding  practices    37 coding practices
## # ... with 12 more rows

# Horizontal bars of the top bigrams, ordered by count and coloured by the
# first word of the pair
tidy_ngram_top %>%
  ggplot(mapping = aes(x = reorder(combined, n), y = n, fill = word1)) +
  geom_col(show.legend = FALSE) +
  coord_flip()

# words with data
# (bigrams whose first word is "data"; the second word must not be a stop
# word and must not contain an apostrophe)

data_bigram <- event_descriptions %>%
  unnest_tokens(bigram, description, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    word1 %in% c("data"),
    !word2 %in% stop_words$word,
    !str_detect(word2, "'")
  ) %>%
  count(word1, word2, sort = TRUE)

# Top second words per first word, ranked explicitly on the count column
# (ties at the cut-off are kept). The grouping is removed afterwards so later
# steps receive a plain, ungrouped table.
data_bigram_top <- data_bigram %>%
  group_by(word1) %>%
  top_n(25, n) %>%
  ungroup()

# Horizontal bars of the words that follow each first word, ordered by
# count, one panel per first word
data_bigram_top %>%
  ggplot(mapping = aes(x = reorder(word2, n), y = n, fill = word1)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ word1, scales = "free") +
  coord_flip()

# Word cloud of the second words of the selected bigrams, sized by frequency.
# `colors` takes a vector of colours (used from least to most frequent), so
# the active base-graphics palette is passed rather than the count column.
wordcloud(
  words = data_bigram_top$word2,
  freq = data_bigram_top$n,
  scale = c(5, 1),
  min.freq = 5,
  colors = palette()
)

Tidytexting R-Ladies Meetups

Augustina Ragwitz

November 30, 2017