The Google search data in this notebook comes from a Google account archive.
The steps outlined here to collect and analyze the data may change at any time.
Below are the steps to claim your Google account data.
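At the time of writing, the process looks roughly like this (menu names and layout may differ by the time you read this):
1. Go to https://takeout.google.com and sign in to your Google account.
2. Deselect everything except "My Activity", which includes your Search history, and choose HTML as the export format.
3. Create the archive, download it once Google notifies you it is ready, and unzip it locally.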
# Check whether the package is installed
if(!require(rvest, quietly = TRUE))
  # If the package is not on the system, install it
  install.packages("rvest", dependencies = TRUE, quiet = TRUE)
# Load the package
library(rvest, quietly = TRUE)
if(!require(lubridate, quietly = TRUE))
  install.packages("lubridate", dependencies = TRUE, quiet = TRUE)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(lubridate, quietly = TRUE)
if(!require(wordcloud, quietly = TRUE))
  install.packages("wordcloud", dependencies = TRUE, quiet = TRUE)
library(wordcloud, quietly = TRUE)
if(!require(tm, quietly = TRUE))
  install.packages("tm", dependencies = TRUE, quiet = TRUE)
library(tm, quietly = TRUE)
if(!require(tidyverse, quietly = TRUE))
  install.packages("tidyverse", dependencies = TRUE, quiet = TRUE)
## -- Attaching packages -------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
library(tidyverse, quietly = TRUE)
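The parsing steps below assume the exported search page has already been read into search_archive. A minimal sketch, assuming the usual Takeout folder layout (the file path here is an assumption; point it at your own unzipped archive):
# Read the exported search history page into an xml2 document
# (path is an assumption based on the typical Takeout layout)
search_archive <- read_html("Takeout/My Activity/Search/MyActivity.html")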
# Extract the timestamp that follows the <br> tag in each activity cell
date_search <- search_archive %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  str_extract(pattern = "(?<=<br>)(.*)(?<=PM|AM)") %>%
  mdy_hms()
date_search[1:10]
## [1] "2018-01-30 13:04:55 UTC" "2018-01-30 13:04:16 UTC"
## [3] "2018-01-29 22:31:17 UTC" "2018-01-29 22:30:54 UTC"
## [5] "2018-01-29 21:15:20 UTC" "2018-01-29 14:21:18 UTC"
## [7] "2018-01-29 14:21:16 UTC" "2018-01-28 18:21:18 UTC"
## [9] "2018-01-28 18:19:53 UTC" "2018-01-28 17:28:11 UTC"
# Extract the search query or visited URL from the anchor tag in each cell
text_search <- search_archive %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  str_extract(pattern = '(?<=<a)(.*)(?=</a>)') %>%
  str_extract(pattern = '(?<=\">)(.*)')
text_search[1:10]
## [1] "this is us season 2 episode 13" "this is us next episode"
## [3] "reacting to this is us" "27 reactions to this is us"
## [5] "Google Search" "https://get.adobe.com/reader/"
## [7] "adobe acrobat" "dhsmv crash report"
## [9] "dhsmv" "do you want a high or low snr"
# Extract the activity type ("Searched" or "Visited") that precedes the anchor tag
type_search <- search_archive %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  str_extract(pattern = "(?<=mdl-typography--body-1\">)(.*)(?=<a)") %>%
  str_extract(pattern = "(\\w+)(?=\\s)")
type_search[1:10]
## [1] "Searched" "Searched" "Searched" "Searched" "Visited" "Visited"
## [7] "Searched" "Searched" "Searched" "Searched"
search_data <- tibble(timestamp = date_search,
                      date = as_date(date_search),
                      year = year(date_search),
                      month = month(date_search, label = TRUE),
                      day = weekdays(date_search),
                      hour = hour(date_search),
                      type = type_search,
                      search = text_search)
search_data$day <- factor(search_data$day,
                          levels = c("Sunday", "Monday", "Tuesday",
                                     "Wednesday", "Thursday", "Friday",
                                     "Saturday"))
search_data <- na.omit(search_data)
head(search_data)
## # A tibble: 6 x 8
## timestamp date year month day hour type search
## <dttm> <date> <dbl> <ord> <fct> <int> <chr> <chr>
## 1 2018-01-30 13:04:55 2018-01-30 2018 Jan Tuesday 13 Searched this ~
## 2 2018-01-30 13:04:16 2018-01-30 2018 Jan Tuesday 13 Searched this ~
## 3 2018-01-29 22:31:17 2018-01-29 2018 Jan Monday 22 Searched react~
## 4 2018-01-29 22:30:54 2018-01-29 2018 Jan Monday 22 Searched 27 re~
## 5 2018-01-29 21:15:20 2018-01-29 2018 Jan Monday 21 Visited Googl~
## 6 2018-01-29 14:21:18 2018-01-29 2018 Jan Monday 14 Visited https~
p <- ggplot(search_data, aes(year))
p + geom_bar()
# Subset to searches from 2015 through 2017 for the month-by-month comparison
monthly <- search_data[(search_data$year > 2014 & search_data$year < 2018), ]
ggplot(monthly) + geom_bar(aes(x = month, group = year)) +
  theme(axis.text.x = element_text(angle = 90)) +
  facet_grid(. ~ year, scales = "free")
p <- ggplot(search_data, aes(hour))
p + geom_bar()
p <- ggplot(search_data, aes(day))
p + geom_bar()
ggplot(search_data) +
  geom_bar(aes(x = hour, group = day)) +
  facet_grid(. ~ day, scales = "free")
wkday <- group_by(search_data, year, day) %>% summarize(count = n())
p <- ggplot(wkday, aes(day, count, fill = year))
p + geom_bar(stat = "identity") + labs(x = "", y = "Search Volume")
search <- tolower(search_data$search)
#search <- iconv(search, "ASCII", "UTF-8", " ")
# Remove URLs, #hashtags/@handles, newlines, and escaped quotes
search <- gsub('(http|https)\\S+\\s*|(#|@)\\S+\\s*|\\n|\\"', " ", search)
# Remove .com addresses and any remaining non-alphanumeric characters
search <- gsub("(.*.)\\.com(.*.)\\S+\\s|[^[:alnum:]]", " ", search)
# Trim leading and trailing whitespace
search <- trimws(search)
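As an optional check (not in the original notebook), inspect a few cleaned strings to confirm the substitutions behaved as expected:
# Peek at the cleaned queries; compare against text_search[1:10] above
head(search, 10)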
# Build a text corpus from the cleaned queries
search_corpus <- Corpus(VectorSource(search))
search_corpus <- tm_map(search_corpus, content_transformer(removePunctuation))
search_corpus <- tm_map(search_corpus, content_transformer(removeNumbers))
# Standard English stopwords plus domain-specific terms to drop from the cloud
# (renamed to avoid shadowing tm's stopwords() function)
custom_stopwords <- c(stopwords("english"), "chicago", "michigan", "search",
                      "google", "returning", "luc", "university", "season",
                      "time", "oct", "can")
search_corpus <- tm_map(search_corpus, removeWords, custom_stopwords)
search_tdm <- TermDocumentMatrix(search_corpus)
search_matrix <- as.matrix(search_tdm)
# Sum each term's count across documents to get overall word frequencies
word_freqs <- sort(rowSums(search_matrix), decreasing = TRUE)
d <- data.frame(word = names(word_freqs), freq = word_freqs)
wordcloud(d$word, d$freq, min.freq = 40, scale = c(3, 0.5), max.words = 200)
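Note that the word cloud layout is randomized, so the figure changes between runs. For a reproducible figure, call set.seed() immediately before the wordcloud() line (this is an addition, not part of the original analysis):
set.seed(123)  # any fixed seed makes the word placement reproducible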