General Note

This example is part of a larger set of examples of using Google Analytics with R, even though this particular example uses Twitter. For additional examples, downloadable code, and explanations of the overall effort, see: https://github.com/SDITools/ga-and-r-examples.

Overview

This example pulls all of the followers for a given user and then does some text mining on their profile descriptions.

Setup/Config

# Set the base account
tw_account <- "analyticshour"

# Load the necessary libraries. 
if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
pacman::p_load(rtweet,            # How we actually get the Twitter data
               tidyverse,         # Includes dplyr, ggplot2, and others; very key!
               knitr,             # Nicer looking tables
               tidytext,          # Tidy text!
               SnowballC,         # Mainly for stemming the search terms
               DT,                # Make a nice data table
               wordcloud,         # Word cloud creation
               RColorBrewer,      # Get some palettes to use with the word cloud
               topicmodels)       # For the topic modeling using LDA


# Minimum number of occurrences for a term to be included in the word cloud
min_frequency <- 2

# Set the number of topics to include in the topic model
num_topics <- 4

# Set the stopwords language
stopwords_lang <- "en"

# Words to exclude (because they're too dominant to be interesting). This will exclude
# them from the main word cloud AND from the topic modeling
exclude_words <- c("https")

# Label for what this is
main_label <- paste0("Followers of @",tw_account)

# Get app credentials

# The name assigned to the app you created. You'll need to set these up in the Twitter
# developer console and then either hardcode them here or put them in your .Renviron file
tw_appname <- Sys.getenv("TWITTER_APPNAME")

# Key and Secret
tw_key <- Sys.getenv("TWITTER_KEY")
tw_secret <- Sys.getenv("TWITTER_SECRET")
tw_access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
tw_access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")


# Create the token. 
tw_token <- create_token(
    app = tw_appname,
    consumer_key = tw_key,
    consumer_secret = tw_secret,
    access_token = tw_access_token,
    access_secret = tw_access_secret)
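
As a quick sanity check (not part of the original walkthrough), you can confirm the token is valid before pulling any data; with a working token, rate_limit() returns a data frame of API rate limits rather than an authentication error:

# Optional sanity check: a valid token returns a data frame of rate limits
rate_limit(token = tw_token)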

Get the Data and Clean It Up

# Get a list of all followers
user_followers <- get_followers(tw_account, n=10000, token = tw_token)

# # Split that vector up into a list with 15,000 users per chunk (needed to get the
# # details when an account has a very large number of followers)
# user_followers_split <- split(user_followers, rep(1:ceiling(nrow(user_followers)/15000), each=15000, 
#                                                             length.out = nrow(user_followers)))

# Get the user details for all of those users
followers_details <-  lookup_users(user_followers$user_id, parse = TRUE, token = tw_token)

# Output how many users are being analyzed
cat(paste0("The following assessment covers the ", nrow(followers_details), " ", tolower(main_label), "."))
## The following assessment covers the 1058 followers of @analyticshour.
# Unnest it -- put each word from the descriptions on its own row and then count
# the occurrences of each word. This will also make everything lowercase and strip punctuation!
followers_data <- followers_details %>% 
  unnest_tokens(description_term, description) %>% 
  group_by(description_term) %>% 
  summarise(occurrences = n()) %>% 
  select(description_term, occurrences) %>% 
  ungroup() %>% 
  arrange(-occurrences)

# Remove the stop words. 1) get the stopwords, 2) remove 'em
stop_words <- get_stopwords(language = stopwords_lang) %>% 
  select(word)

followers_data <- followers_data %>% 
  anti_join(stop_words, by = c(description_term = "word"))
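
As an aside (illustrative only), get_stopwords() returns a two-column tibble of word and lexicon, which is why only the word column is selected above before the anti_join:

# Illustration: the stopword lexicon is just a tibble of words
head(get_stopwords(language = "en"), 3)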

# Convert UTF-8 to ASCII (needed because all hell starts to break loose if you
# try to text-mine multibyte characters). So, we're going to try to convert
# everything to ASCII. For some terms, this will fail and return NA, so we'll
# then just remove the NA rows.
followers_data <- followers_data %>%
  mutate(description_term = iconv(description_term, "UTF-8", "ASCII")) %>% 
  filter(!is.na(description_term))
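
To see why that NA filter is needed (an illustrative example): a term containing a multibyte character has no ASCII representation, so iconv() returns NA for it, and the filter() above then drops that row.

# Illustration: the non-ASCII term comes back as NA
iconv(c("analytics", "café"), "UTF-8", "ASCII")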

# Perform stemming.
followers_data <- followers_data %>% 
  mutate(description_term_stem = wordStem(description_term))
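
To make the effect of stemming concrete (an illustrative example, not part of the original code), wordStem() collapses inflected variants of a word to a single shared stem, which is what lets the occurrence counts get totaled by stem below:

# Illustration: all three variants reduce to the common stem "market"
wordStem(c("marketing", "marketers", "marketed"))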

# Go ahead and find the most popular un-stemmed word for each stemmed word.
# That will make the results look more "normal" to the casual viewer. We don't want
# to have any ties, so we're going to somewhat arbitrarily break any ties by adding
# the row number / 1000000 to each of the occurrence counts first (we'll toss this later).
followers_data_top_term <- followers_data %>% 
  mutate(occurrences = occurrences + row_number()/1000000) %>% 
  group_by(description_term_stem) %>% 
  top_n(1, occurrences) %>% 
  select(-occurrences)

# Join that back to the followers data after totaling the occurrences by the stemmed term.
followers_data <- followers_data %>% 
  group_by(description_term_stem) %>% 
  summarise(occurrences = sum(occurrences)) %>% 
  left_join(followers_data_top_term, by = "description_term_stem") %>% 
  ungroup() %>% 
  select(description_term_stem, description_term, occurrences) %>% 
  arrange(-occurrences)

# Remove any additional "exclude words" specified
followers_data <-  followers_data %>%
  filter(!description_term_stem %in% exclude_words)

# Get rid of the "top term" data frame
rm(followers_data_top_term)
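
The min_frequency setting and the wordcloud/RColorBrewer packages loaded earlier come together in the word cloud itself. Here's a minimal sketch of that step, assuming the followers_data built above (the exact call in the full example may differ):

# Sketch of the word cloud: parallel vectors of terms and their frequencies
set.seed(1120)
wordcloud(words = followers_data$description_term,
          freq = followers_data$occurrences,
          min.freq = min_frequency,
          max.words = 100,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))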

Show the Original Descriptions

followers_details %>% 
  arrange(-followers_count) %>% 
  select(screen_name, description, followers_count) %>% 
  datatable(colnames = c("Username", "Description", "# of Followers"), rownames = FALSE)
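
Likewise, num_topics and the topicmodels package are loaded for the LDA step. A minimal sketch of how that might look (assumed, not the original code): LDA needs per-document term counts, so this re-tokenizes each follower's description rather than reusing the aggregated followers_data.

# Sketch of the topic model: build a per-user document-term matrix, then fit LDA
followers_dtm <- followers_details %>% 
  unnest_tokens(description_term, description) %>% 
  anti_join(stop_words, by = c(description_term = "word")) %>% 
  filter(!description_term %in% exclude_words) %>% 
  count(user_id, description_term) %>% 
  cast_dtm(user_id, description_term, n)

followers_lda <- LDA(followers_dtm, k = num_topics, control = list(seed = 1120))

# Inspect the top terms for each topic
tidy(followers_lda, matrix = "beta") %>% 
  group_by(topic) %>% 
  top_n(5, beta) %>% 
  arrange(topic, -beta)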