About this Notebook





Analytics Toolkit: Required Packages



# Check whether the package is already installed
if (!require("rvest", quietly = TRUE)) {
  # If the package is not on the system, install it
  install.packages("rvest", dependencies = TRUE, quiet = TRUE)
}
# Load the package
library(rvest, quietly = TRUE)

if(!require("lubridate", quietly = TRUE)) 
  install.packages("lubridate", dependencies = TRUE, quietly = TRUE)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
  library(lubridate, quietly = TRUE)

if(!require("wordcloud", quietly = TRUE)) 
  install.packages("wordcloud", dependencies = TRUE, quietly = TRUE)
  library(wordcloud, quietly = TRUE)

if(!require("tm", quietly = TRUE)) 
  install.packages("tm", dependencies = TRUE, quietly = TRUE)
  library(tm, quietly = TRUE)

if(!require("tidyverse", quietly = TRUE))
  install.packages("tidyverse", dependencies = TRUE, quietly = TRUE)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate()      masks NLP::annotate()
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ lubridate::date()        masks base::date()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ readr::guess_encoding()  masks rvest::guess_encoding()
## ✖ lubridate::intersect()   masks base::intersect()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ purrr::pluck()           masks rvest::pluck()
## ✖ lubridate::setdiff()     masks base::setdiff()
## ✖ lubridate::union()       masks base::union()
library(tidyverse, quietly = TRUE)



Data Collection: Claiming your Google Search Data



1) Sign in to your Google account, then go to Google Takeout to request a copy of your data.

4) After selecting the products, choose the file type and the maximum archive size to make sure that all of your account data is archived.




Data Preparation: Extracting Google Search Information



Locate the Google archive, then find the search data. In this case it is an HTML file: inside the “My Activity” folder, open the “Search” folder, and the file “MyActivity.html” should be there:

  • Takeout -> My Activity -> Search -> MyActivity.html
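
The activity page can then be loaded into R with rvest's read_html(). A minimal sketch, assuming the archive was extracted into the current working directory (adjust the path to wherever your Takeout folder lives):

# Parse the Takeout activity page into an HTML document object
search_archive <- read_html("Takeout/My Activity/Search/MyActivity.html")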

Leveraging regular expressions, we can search the HTML document to extract:


Extract Search Time

# Select each activity entry, pull out the timestamp that follows the <br> tag
# (ending in AM/PM), and parse it into a date-time with lubridate
date_search <- search_archive %>% 
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>% 
  str_extract(pattern = "(?<=<br>)(.*)(?<=PM|AM)") %>%
  mdy_hms()


Extract Search Text

# Within each entry, grab the contents of the anchor tag (<a ...>...</a>)
# and keep the link text that follows the closing quote of its attributes
text_search <- search_archive %>% 
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  str_extract(pattern = '(?<=<a)(.*)(?=</a>)') %>% 
  str_extract(pattern = '(?<=\">)(.*)')


Extract Search Type

# The first word of the body text before the anchor indicates the activity
# type (e.g. "Searched" or "Visited")
type_search <- search_archive %>% 
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>% 
  str_extract(pattern = "(?<=mdl-typography--body-1\">)(.*)(?=<a)") %>% 
  str_extract(pattern = "(\\w+)(?=\\s)")


Create a data frame using the data extracted from the HTML file:

search_data <- tibble(timestamp = date_search,
                      date = as_date(date_search),
                      year = year(date_search),
                      month = month(date_search, label = TRUE),
                      day = weekdays(date_search),
                      hour = hour(date_search),
                      type = type_search,
                      search = text_search)

# Order the day-of-week factor from Sunday to Saturday for plotting
search_data$day <- factor(search_data$day, 
                          levels = c("Sunday", "Monday", "Tuesday",
                                     "Wednesday","Thursday", "Friday",
                                     "Saturday"))

# Drop rows where any field could not be extracted or parsed
search_data <- na.omit(search_data)

head(search_data)
## # A tibble: 6 x 8
##   timestamp           date        year month day       hour type   search 
##   <dttm>              <date>     <dbl> <ord> <fct>    <int> <chr>  <chr>  
## 1 2018-02-01 12:32:29 2018-02-01  2018 Feb   Thursday    12 Visit… https:…
## 2 2018-02-01 12:32:26 2018-02-01  2018 Feb   Thursday    12 Searc… downlo…
## 3 2018-01-30 13:13:56 2018-01-30  2018 Jan   Tuesday     13 Visit… Google…
## 4 2018-01-30 13:12:22 2018-01-30  2018 Jan   Tuesday     13 Visit… https:…
## 5 2018-01-30 13:11:35 2018-01-30  2018 Jan   Tuesday     13 Searc… restau…
## 6 2018-01-30 13:06:36 2018-01-30  2018 Jan   Tuesday     13 Visit… https:…
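
Before plotting, it can help to check the period covered and the number of records that survived cleaning (the output will vary by account):

# Date range and row count of the cleaned search data
range(search_data$date)
nrow(search_data)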



Data Analysis: Visualizing Google Searches



To get an overall idea of the search volume, we can plot searches by year.

p <- ggplot(search_data, aes(year))
p + geom_bar()


After determining the years with the largest search volume, we can plot the monthly searches.

# Keep the years with the largest search volume (2015-2017)
monthly <- search_data[(search_data$year > 2014 & search_data$year < 2018), ]

ggplot(monthly) + geom_bar(aes(x = month, group = year)) +
  theme(axis.text.x = element_text(angle=90)) +
  facet_grid(.~year, scales="free")


Another interesting metric is searches by hour.

p <- ggplot(search_data, aes(hour))
p + geom_bar()


We can also plot the search data by day of the week to determine which days are the most active.

p <- ggplot(search_data, aes(day))
p + geom_bar()


We can take it a step further and group search time by day of the week.

ggplot(search_data) + 
  geom_bar(aes(x = hour, group = day) ) +
  facet_grid(.~day, scales = "free")


We can group the search data by year and day of the week to visualize the overall trend.

wkday <- group_by(search_data, year, day) %>% summarize(count = n())
p <- ggplot(wkday, aes(day, count, fill = year)) 
p + geom_bar(stat = "identity") + labs(x = "", y = "Search Volume")
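
Note that year is numeric, so ggplot2 maps the fill to a continuous colour scale. If a discrete legend per year is preferred, a small variation is to convert year to a factor:

p <- ggplot(wkday, aes(day, count, fill = factor(year)))
p + geom_bar(stat = "identity") + labs(x = "", y = "Search Volume", fill = "Year")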



Reporting: A Wordcloud from Google Search Data



First, we need to extract the text and clean it using regular expressions.

# Lower-case the searches and substitute characters that cannot be converted
search <- tolower(search_data$search)
search <- iconv(search, "ASCII", "UTF-8", sub = " ")
# Strip URLs, hashtags, handles, newlines, and escaped quotes
search <- gsub('(http|https)\\S+\\s*|(#|@)\\S+\\s*|\\n|\\"', " ", search)
# Remove .com domains and any remaining non-alphanumeric characters
search <- gsub("(.*.)\\.com(.*.)\\S+\\s|[^[:alnum:]]", " ", search)
search <- trimws(search)


After cleaning the text, we can create a text corpus (a large and structured set of texts) and remove some words.

# Build a corpus from the cleaned search strings
search_corpus <- Corpus(VectorSource(search))
# Remove punctuation and numbers
search_corpus <- tm_map(search_corpus, content_transformer(removePunctuation))
search_corpus <- tm_map(search_corpus, content_transformer(removeNumbers))
# Drop common English stopwords plus a few account-specific terms
stopwords <- c(stopwords("english"), "chrome", "chicago", "jlroo", "google")
search_corpus <- tm_map(search_corpus, removeWords, stopwords)


# Build a term-document matrix and convert it to a plain matrix of counts
search_tdm <- TermDocumentMatrix(search_corpus)
search_matrix <- as.matrix(search_tdm)
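
The wordcloud() call below expects a data frame d with one row per term and its total frequency. A minimal sketch to derive it from search_matrix (the intermediate name word_freq is illustrative):

# Sum the counts of each term across all documents and sort them
word_freq <- sort(rowSums(search_matrix), decreasing = TRUE)
d <- data.frame(word = names(word_freq), freq = word_freq)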


Set a threshold for the minimum frequency of the words to display, as well as the maximum number of words to plot.

# Draw the wordcloud, showing terms that appear at least 50 times
wordcloud(d$word, d$freq, min.freq = 50, scale = c(3, 0.5), max.words = 200)