This example is part of a larger set of examples of using Google Analytics with R:
This example assumes that site search tracking is enabled for the site in Google Analytics, and it does three things:
This entire example is, essentially, cobbling together ideas from three specific people, so a hat tip to all three:
So, now, on to the code!
# Load the necessary libraries.
if (!require("pacman")) install.packages("pacman")

## Loading required package: pacman

pacman::p_load(googleAnalyticsR,  # How we actually get the Google Analytics data
               tidyverse,         # Includes dplyr, ggplot2, and others; very key!
               knitr,             # Nicer-looking tables
               tidytext,          # Tidy text!
               SnowballC,         # Mainly for stemming the search terms
               DT,                # Make a nice data table
               wordcloud,         # Word cloud creation
               RColorBrewer,      # Get some palettes to use with the word cloud
               topicmodels)       # For the topic modeling using LDA
# Authorize GA. Depending on whether you've already authenticated and a .httr-oauth
# file has been saved, this may pop you over to a browser to authenticate.
ga_auth(token = ".httr-oauth")
## Token cache file: .httr-oauth
# Set the view ID and the date range. If you want, you can swap out the
# Sys.getenv() call for a hardcoded view ID.
view_id <- Sys.getenv("GA_VIEW_ID_DAPH")
start_date <- Sys.Date() - 120   # The last 120 days
end_date <- Sys.Date() - 1       # Yesterday
# Minimum # of searches for a term to be included in the word cloud
min_frequency <- 1
# Set the number of topics to include in the topic model
num_topics <- 4
# Set the stopwords language
stopwords_lang <- "en"
# Words to exclude (because they're too dominant to be interesting). This will exclude
# them from the main word cloud AND from the topic modeling
exclude_words <- c("cancer", "breast", "de")
# Pull the data
ga_data <- google_analytics(viewId = view_id,
                            date_range = c(start_date, end_date),
                            metrics = "searchUniques",
                            dimensions = "searchKeyword",
                            anti_sample = TRUE)
# Unnest it -- put each word on its own row and then collapse the individual
# words. This will also make everything lowercase and strip punctuation!
search_data <- ga_data %>%
  unnest_tokens(search_term, searchKeyword) %>%
  group_by(search_term) %>%
  summarise(searches = sum(searchUniques)) %>%
  dplyr::select(search_term, searches) %>%
  ungroup() %>%
  arrange(-searches)
# Remove the stop words: 1) get the stopwords, 2) remove 'em
stop_words <- get_stopwords(language = stopwords_lang) %>%
  dplyr::select(word)

search_data <- search_data %>%
  anti_join(stop_words, by = c(search_term = "word"))
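If you're curious what's getting anti-joined there, get_stopwords() returns a two-column tibble (word and lexicon; the default lexicon is "snowball"), which is why only the word column gets selected. A quick peek, purely for illustration:
# Inspect the first few stopwords for the configured language
head(get_stopwords(language = stopwords_lang))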
# Convert UTF-8 to ASCII (needed because all hell starts to break loose if you
# try to text-mine multibyte characters). We'll try to convert everything to
# ASCII; for some terms this will fail and return NA, so we'll then remove
# the NA rows.
search_data <- search_data %>%
  mutate(search_term = iconv(search_term, "UTF-8", "ASCII")) %>%
  filter(!is.na(search_term))
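To see why that filter is there: when iconv() hits a character it can't represent in ASCII (and no substitution is specified), it returns NA for the entire string. A tiny illustration (the accented term is just a made-up example):
# A single multibyte character makes iconv() return NA for the whole term
iconv(c("cancer", "caf\u00e9"), "UTF-8", "ASCII")   # "cancer" NA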
# Perform stemming.
search_data <- search_data %>%
  mutate(search_term_stem = wordStem(search_term))
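If stemming is new to you, wordStem() applies a Porter stemmer that collapses inflected variants down to a common stem so they get counted together. For example:
# "searches", "searched", and "searching" all reduce to the stem "search"
wordStem(c("searches", "searched", "searching"))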
# Go ahead and find the most popular un-stemmed word for each stemmed word.
# That will make the results look more "normal" to the casual viewer. We don't
# want any ties, so we're going to somewhat arbitrarily break them by adding
# the row number / 1000000 to each of the search counts first (e.g., two variants
# tied at 5 searches become 5.000001 and 5.000002, so top_n(1, ...) returns
# exactly one row per stem). We'll toss this adjustment later.
search_data_top_term <- search_data %>%
  mutate(searches = searches + row_number() / 1000000) %>%
  group_by(search_term_stem) %>%
  top_n(1, searches) %>%
  dplyr::select(-searches)
# Join that back to the search data after totalling the searches by stemmed term.
search_data <- search_data %>%
  group_by(search_term_stem) %>%
  summarise(searches = sum(searches)) %>%
  left_join(search_data_top_term, by = "search_term_stem") %>%
  ungroup() %>%
  dplyr::select(search_term_stem, search_term, searches) %>%
  arrange(-searches)
# Remove any additional "exclude words" specified
search_data <- search_data %>%
  filter(!search_term_stem %in% exclude_words)

# Get rid of the "top term" data frame
rm(search_data_top_term)
ga_data %>%
  arrange(-searchUniques) %>%
  datatable(colnames = c("Search Term", "Searches"), rownames = FALSE)
Searches that started with a question word: who, what, why, when, where, or how.
ga_data %>%
  arrange(-searchUniques) %>%
  filter(grepl("^(who|what|why|when|where|how)", searchKeyword)) %>%
  datatable(colnames = c("Question", "Searches"), rownames = FALSE)
This looks similar to the report in Google Analytics, but it’s been processed down to individual words: stemmed, with stopwords removed, and so on.
dplyr::select(search_data, search_term, searches) %>%
  datatable(colnames = c("Search Term", "Searches"),
            rownames = FALSE)
A word cloud based on the cleaned-up and unnested words.
# Set a seed for reproducibility
set.seed(1971)

# Set a color palette
color_palette <- rev(brewer.pal(8, "Spectral"))

# Generate the word cloud!
wordcloud(words = search_data$search_term,
          freq = search_data$searches,
          scale = c(5.5, 0.6),
          min.freq = min_frequency,
          max.words = 500,
          random.order = FALSE,
          rot.per = 0,
          colors = color_palette)
We’re going to use latent Dirichlet allocation (LDA) to try to break these words out into topics. This basically follows the process for LDA outlined at https://www.tidytextmining.com/topicmodeling.html.
# Cast the term frequency data into a document-term matrix. We're treating this
# all as one "document," so we're just hardcoding a "1" for that.
search_data_dtm <- search_data %>%
  mutate(doc = 1) %>%
  cast_dtm(doc, search_term, searches)
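As an optional sanity check, the result should be a DocumentTermMatrix with exactly one row, since everything got lumped into that single "document":
# dim() returns c(number of documents, number of terms) -- expect 1 document
dim(search_data_dtm)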
# Run LDA. Setting a seed for reproducibility
search_lda <- LDA(search_data_dtm, k = num_topics, control = list(seed = 1120))
# Extract the probability (beta) of each term appearing in each of the topics
search_topics <- tidy(search_lda, matrix = "beta")
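If you want to eyeball the topics before generating word clouds, the inspection step from tidytextmining.com translates directly; this builds a throwaway top_terms data frame purely for viewing (it isn't used below):
# Pull the ten highest-beta terms within each topic
top_terms <- search_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)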
# For each term, assign it to the topic for which it has the highest beta. This
# diverges from the approach described at tidytextmining.com, but it seems like
# a reasonably legit thing to do.
search_topics_and_terms <- search_topics %>%
  group_by(term) %>%
  top_n(1, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  left_join(search_data, by = c(term = "search_term"))
# Function to generate a word cloud for the topic ID passed in
generate_topic_wordcloud <- function(topic_id){

  # Filter the data to just this topic and knock out terms with a
  # reallllly low beta
  topic_data <- search_topics_and_terms %>%
    filter(topic == topic_id &
             beta > 0.001)

  # Generate the word cloud!
  wordcloud(words = topic_data$term,
            freq = topic_data$searches,
            scale = c(3.5, 1),
            min.freq = min_frequency,
            max.words = 500,
            random.order = FALSE,
            rot.per = 0,
            colors = color_palette)
}
# Call the function for each topic ID
topic_wordclouds <- map(seq_len(num_topics), generate_topic_wordcloud)
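Since the word clouds are drawn as a side effect and the returned list isn't actually used for anything, purrr's walk() would be a reasonable swap for map() here:
# walk() runs the function for its side effects and returns its input invisibly
walk(seq_len(num_topics), generate_topic_wordcloud)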