About this Notebook





Analytics Toolkit: Required Packages



# Packages this notebook depends on, in the order they are attached.
required_packages <- c("rvest", "lubridate", "wordcloud", "tm", "tidyverse")

for (pkg in required_packages) {
  # Install the package only when it is not already available.
  # requireNamespace() checks for availability without attaching anything.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE, quiet = TRUE)
  }
  # Attach the package; character.only = TRUE is needed because `pkg`
  # holds the package name as a string. library() errors if loading fails.
  library(pkg, character.only = TRUE, quietly = TRUE)
}

Data Preparation: Extracting Google Search Information



Locate the Google archive, then find the search data. In this case, it is an HTML file: open the “My Activity” folder, then the “Search” folder, and the file “MyActivity.html” should be there:

  • Takeout -> My Activity -> Search -> MyActivity.html

Leveraging regular expressions, we can search the HTML document to extract:


Extract Search Time

# Select the activity cells of the archive, keep the text that sits between
# a "<br>" tag and the trailing AM/PM marker, and parse it as a
# month-day-year hour-minute-second timestamp.
date_search <- mdy_hms(
  str_extract(
    html_nodes(search_archive, xpath = '//div[@class="mdl-grid"]/div/div'),
    pattern = "(?<=<br>)(.*)(?<=PM|AM)"
  )
)


Extract Search Text

# Select the activity cells of the archive and pull out the link text:
# the inner str_extract() keeps what lies between "<a" and "</a>", the outer
# one keeps what follows the `">` that closes the opening tag.
text_search <- str_extract(
  str_extract(
    html_nodes(search_archive, xpath = '//div[@class="mdl-grid"]/div/div'),
    pattern = '(?<=<a)(.*)(?=</a>)'
  ),
  pattern = '(?<=\">)(.*)'
)


Extract Search Type

# Select the activity cells of the archive and pull out the activity type:
# the inner str_extract() keeps the text between the "mdl-typography--body-1"
# cell opening and the first "<a", the outer one keeps the first word that is
# followed by whitespace.
type_search <- str_extract(
  str_extract(
    html_nodes(search_archive, xpath = '//div[@class="mdl-grid"]/div/div'),
    pattern = "(?<=mdl-typography--body-1\">)(.*)(?=<a)"
  ),
  pattern = "(\\w+)(?=\\s)"
)


Create a data frame using the data extracted from the html file

# Assemble one row per search with the timestamp broken out into the
# calendar fields used by the plots below.
search_data <- tibble(
  timestamp = date_search,
  date = as_date(date_search),
  year = year(date_search),
  month = month(date_search, label = TRUE),
  # Build the weekday factor from the numeric day of the week
  # (week_start = 7 makes Sunday = 1) instead of weekdays(), whose names
  # follow the session locale and would not match these English levels
  # on a non-English system (turning every day into NA).
  day = factor(
    wday(date_search, week_start = 7),
    levels = 1:7,
    labels = c("Sunday", "Monday", "Tuesday", "Wednesday",
               "Thursday", "Friday", "Saturday")
  ),
  hour = hour(date_search),
  type = type_search,
  search = text_search
)

# Drop rows where any field could not be extracted or parsed.
search_data <- na.omit(search_data)
head(search_data)



Data Analysis: Visualizing Google Searches



To get an overall idea of the search volume, we can plot searches by year

# Bar chart counting the searches recorded in each year.
p <- search_data %>%
  ggplot(mapping = aes(x = year))
p + geom_bar()


After determining the years with the largest search volume, we can plot monthly searches

# Keep only the searches made strictly after 2014 and strictly before 2018.
monthly <- search_data[with(search_data, year > 2014 & year < 2018), ]

# Monthly search counts, one facet per year, with vertical month labels.
ggplot(data = monthly) +
  geom_bar(mapping = aes(x = month, group = year)) +
  facet_grid(. ~ year, scales = "free") +
  theme(axis.text.x = element_text(angle = 90))


Another interesting metric is searches by hour

# Bar chart counting the searches made in each hour of the day.
p <- search_data %>%
  ggplot(mapping = aes(x = hour))
p + geom_bar()


We can also plot the search data by day of the week to determine which days are the most active

# Bar chart counting the searches made on each day of the week.
p <- search_data %>%
  ggplot(mapping = aes(x = day))
p + geom_bar()


We can take it a step further and group search time with day of the week.

# Hourly search counts, split into one facet per day of the week.
search_data %>%
  ggplot() +
  geom_bar(mapping = aes(x = hour, group = day)) +
  facet_grid(. ~ day, scales = "free")


We can group the search data by year and day of the week, to visualize the overall trend

# Number of searches for every (year, day-of-week) combination.
# ungroup() removes the leftover `year` grouping that summarize() keeps,
# so `wkday` is a plain tibble for anything that uses it afterwards.
wkday <- search_data %>%
  group_by(year, day) %>%
  summarize(count = n()) %>%
  ungroup()

# Stacked bars per weekday. `year` is converted to a factor for the fill so
# each year gets its own discrete colour and legend entry rather than a
# continuous gradient.
p <- ggplot(wkday, aes(day, count, fill = factor(year)))
p + geom_bar(stat = "identity") +
  labs(x = "", y = "Search Volume", fill = "year")



Reporting: A wordcloud from Google Search Data



First we need to extract the text and clean it using regular expressions

# Normalise the search strings before building the corpus:
#   1. lower-case everything;
#   2. blank out http/https URLs, tokens starting with "#" or "@",
#      newlines and double quotes;
#   3. blank out ".com" fragments and every non-alphanumeric character;
#   4. trim leading and trailing whitespace.
# (The iconv() re-encoding step stays disabled, as in the original.)
search <- search_data$search %>%
  tolower() %>%
  gsub(pattern = '(http|https)\\S+\\s*|(#|@)\\S+\\s*|\\n|\\"',
       replacement = " ") %>%
  gsub(pattern = "(.*.)\\.com(.*.)\\S+\\s|[^[:alnum:]]",
       replacement = " ") %>%
  trimws()


After cleaning the text we can create a Text Corpus (a large and structured set of texts) and remove some words

# Words to remove: tm's English stop-word list plus a few terms specific
# to this search history.
stopwords <- c(stopwords("english"), "chrome", "chicago", "jlroo", "google","online","good","rate")

# Build the corpus from the cleaned search strings, then strip punctuation,
# numbers and the stop words defined above.
search_corpus <- VectorSource(search) %>%
  Corpus() %>%
  tm_map(content_transformer(removePunctuation)) %>%
  tm_map(content_transformer(removeNumbers)) %>%
  tm_map(removeWords, stopwords)

# Term-document matrix of the corpus, also kept as a plain matrix.
search_tdm <- search_corpus %>% TermDocumentMatrix()
search_matrix <- search_tdm %>% as.matrix()


Set a threshold for the min frequency of the words to display as well as max frequency

# Total count of each term: sum every row of the term-document matrix,
# most frequent term first.
term_totals <- sort(rowSums(search_matrix), decreasing = TRUE)

# Word/frequency table consumed by wordcloud(). It was referenced as `d`
# but never created, so it is built here from `search_matrix`.
d <- data.frame(word = names(term_totals), freq = term_totals,
                stringsAsFactors = FALSE)

# Draw at most 200 words, skipping any that occur fewer than 30 times.
wordcloud(d$word, d$freq, min.freq = 30, scale = c(2, 0.5), max.words = 200)

