About this Notebook





Analytics Toolkit: Require Packages



# Check whether the package is installed; install it if it is missing, then load it
if (!require(rvest, quietly = TRUE)) {
  install.packages("rvest", dependencies = TRUE, quiet = TRUE)
  library(rvest, quietly = TRUE)
}

if (!require(lubridate, quietly = TRUE)) {
  install.packages("lubridate", dependencies = TRUE, quiet = TRUE)
  library(lubridate, quietly = TRUE)
}
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date

if (!require(wordcloud, quietly = TRUE)) {
  install.packages("wordcloud", dependencies = TRUE, quiet = TRUE)
  library(wordcloud, quietly = TRUE)
}

if (!require(tm, quietly = TRUE)) {
  install.packages("tm", dependencies = TRUE, quiet = TRUE)
  library(tm, quietly = TRUE)
}

if (!require(tidyverse, quietly = TRUE)) {
  install.packages("tidyverse", dependencies = TRUE, quiet = TRUE)
  library(tidyverse, quietly = TRUE)
}
## -- Attaching packages -------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate()      masks NLP::annotate()
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x readr::guess_encoding()  masks rvest::guess_encoding()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x purrr::pluck()           masks rvest::pluck()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()



Data Preparation: Extracting Google Search Information



Locate the Google Takeout archive, then find the search data. In this case it is an HTML file: open the “My Activity” folder, then the “Search” folder, where the file “MyActivity.html” should be:

  • Takeout -> My Activity -> Search -> MyActivity.html

Leveraging regular expressions, we can search the HTML document and extract the search time, the search text, and the search type. First, the archive needs to be read into R, as sketched below.
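
A minimal sketch of reading the archive with rvest, assuming the Takeout export was unzipped into the working directory (adjust the path to wherever MyActivity.html lives):

# Read the exported search history into an HTML document object
search_archive <- read_html("Takeout/My Activity/Search/MyActivity.html")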


Extract Search Time

date_search <- search_archive %>% 
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>% 
  # keep the text that follows the <br> tag, up to and including the AM/PM marker
  str_extract(pattern = "(?<=<br>)(.*)(?<=PM|AM)") %>%
  # parse strings such as "Jan 30, 2018, 1:04:55 PM" into date-times
  mdy_hms()

date_search[1:10]
##  [1] "2018-01-30 13:04:55 UTC" "2018-01-30 13:04:16 UTC"
##  [3] "2018-01-29 22:31:17 UTC" "2018-01-29 22:30:54 UTC"
##  [5] "2018-01-29 21:15:20 UTC" "2018-01-29 14:21:18 UTC"
##  [7] "2018-01-29 14:21:16 UTC" "2018-01-28 18:21:18 UTC"
##  [9] "2018-01-28 18:19:53 UTC" "2018-01-28 17:28:11 UTC"


Extract Search Text

text_search <- search_archive %>% 
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  # isolate the contents of the <a> element that holds the query
  str_extract(pattern = '(?<=<a)(.*)(?=</a>)') %>% 
  # keep only the link text, i.e. everything after the closing "> of the tag
  str_extract(pattern = '(?<=\">)(.*)')

text_search[1:10]
##  [1] "this is us season 2 episode 13" "this is us next episode"       
##  [3] "reacting to this is us"         "27 reactions to this is us"    
##  [5] "Google Search"                  "https://get.adobe.com/reader/" 
##  [7] "adobe acrobat"                  "dhsmv crash report"            
##  [9] "dhsmv"                          "do you want a high or low snr"


Extract Search Type

type_search <- search_archive %>% 
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>% 
  # grab the text between the body-1 div and the link, e.g. "Searched for"
  str_extract(pattern = "(?<=mdl-typography--body-1\">)(.*)(?=<a)") %>% 
  # keep only the first word ("Searched" or "Visited")
  str_extract(pattern = "(\\w+)(?=\\s)")

type_search[1:10]
##  [1] "Searched" "Searched" "Searched" "Searched" "Visited"  "Visited" 
##  [7] "Searched" "Searched" "Searched" "Searched"


Create a data frame from the data extracted from the HTML file.

search_data <- tibble(timestamp = date_search,
                      date = as_date(date_search),
                      year = year(date_search),
                      month = month(date_search, label = TRUE),
                      day = weekdays(date_search),
                      hour = hour(date_search),
                      type = type_search,
                      search = text_search)

search_data$day <- factor(search_data$day, 
                          levels = c("Sunday", "Monday", "Tuesday",
                                     "Wednesday","Thursday", "Friday",
                                     "Saturday"))

search_data <- na.omit(search_data)

head(search_data)
## # A tibble: 6 x 8
##   timestamp           date        year month day      hour type     search
##   <dttm>              <date>     <dbl> <ord> <fct>   <int> <chr>    <chr> 
## 1 2018-01-30 13:04:55 2018-01-30  2018 Jan   Tuesday    13 Searched this ~
## 2 2018-01-30 13:04:16 2018-01-30  2018 Jan   Tuesday    13 Searched this ~
## 3 2018-01-29 22:31:17 2018-01-29  2018 Jan   Monday     22 Searched react~
## 4 2018-01-29 22:30:54 2018-01-29  2018 Jan   Monday     22 Searched 27 re~
## 5 2018-01-29 21:15:20 2018-01-29  2018 Jan   Monday     21 Visited  Googl~
## 6 2018-01-29 14:21:18 2018-01-29  2018 Jan   Monday     14 Visited  https~



Data Analysis: Visualizing Google Searches



To get an overall idea of the search volume, we can plot searches by year.

p <- ggplot(search_data, aes(year))
p + geom_bar()


After determining the years with the largest search volume, we can plot monthly searches.

monthly <- search_data[(search_data$year > 2014 & search_data$year < 2018), ]

ggplot(monthly) + geom_bar(aes(x = month, group = year)) +
  theme(axis.text.x = element_text(angle=90)) +
  facet_grid(.~year, scales="free")


Another interesting metric is searches by hour.

p <- ggplot(search_data, aes(hour))
p + geom_bar()


We can also plot the search data by day of the week to determine which days are the most active.

p <- ggplot(search_data, aes(day))
p + geom_bar()


We can take it a step further and break down search hour by day of the week.

ggplot(search_data) + 
  geom_bar(aes(x = hour, group = day) ) +
  facet_grid(.~day, scales = "free")


We can group the search data by year and day of the week to visualize the overall trend.

wkday <- group_by(search_data, year, day) %>% summarize(count = n())
p <- ggplot(wkday, aes(day, count, fill = year)) 
p + geom_bar(stat = "identity") + labs(x = "", y = "Search Volume")



Reporting: A wordcloud from Google Search Data



First, we need to extract the text and clean it using regular expressions.

search <- tolower(search_data$search)
#search <- iconv(search, "ASCII", "UTF-8", " ")
# strip URLs, #hashtags/@mentions, newlines, and quotation marks
search <- gsub('(http|https)\\S+\\s*|(#|@)\\S+\\s*|\\n|\\"', " ", search)
# drop leftover .com fragments and any remaining non-alphanumeric characters
search <- gsub("(.*.)\\.com(.*.)\\S+\\s|[^[:alnum:]]", " ", search)
# trim leading and trailing whitespace
search <- trimws(search)


After cleaning the text, we can create a text corpus (a large and structured set of texts), remove stop words and other common terms, and then build a term-document matrix of word counts.

search_corpus <-  Corpus(VectorSource(search))
search_corpus <- tm_map(search_corpus, content_transformer(removePunctuation))
search_corpus <- tm_map(search_corpus, content_transformer(removeNumbers))
stopwords <- c(stopwords("english"), "chicago", "michigan", "search", "google", "returning", "luc", "university", "season", "time", "oct", "can")
search_corpus <- tm_map(search_corpus, removeWords, stopwords)


search_tdm <- TermDocumentMatrix(search_corpus)
search_matrix <- as.matrix(search_tdm)
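
The wordcloud() call below expects a frequency table d with word and freq columns. A minimal sketch of building it from the term-document matrix, using the standard tm idiom (the helper name word_freqs is arbitrary):

# Sum the counts for each term across all documents and sort by frequency
word_freqs <- sort(rowSums(search_matrix), decreasing = TRUE)
# Build the word/frequency table consumed by wordcloud()
d <- data.frame(word = names(word_freqs), freq = word_freqs)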


Set a threshold for the minimum frequency a word needs in order to be displayed, as well as a cap on the maximum number of words.

wordcloud(d$word, d$freq, min.freq = 40, scale = c(3 , 0.5), max.words = 200)