The Google search data in this notebook comes from a Google account archive. The steps outlined here to collect and analyze the data may change at any time. Below are the steps to claim your Google account data:

1. Go to takeout.google.com and sign in to your Google account.
2. Select the My Activity data for export (the search activity used here was exported in HTML format).
3. Create the archive, then download and unzip it.
# Check whether the package is installed
if(!require(rvest, quietly = TRUE))
  # If the package is not on the system, install it
  install.packages("rvest", dependencies = TRUE, quiet = TRUE)
# Load the package
library(rvest, quietly = TRUE)
if(!require(lubridate, quietly = TRUE))
  install.packages("lubridate", dependencies = TRUE, quiet = TRUE)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(lubridate, quietly = TRUE)
if(!require(wordcloud, quietly = TRUE))
  install.packages("wordcloud", dependencies = TRUE, quiet = TRUE)
library(wordcloud, quietly = TRUE)
if(!require(tm, quietly = TRUE))
  install.packages("tm", dependencies = TRUE, quiet = TRUE)
library(tm, quietly = TRUE)
if(!require(tidyverse, quietly = TRUE))
  install.packages("tidyverse", dependencies = TRUE, quiet = TRUE)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ lubridate::intersect() masks base::intersect()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::pluck() masks rvest::pluck()
## ✖ lubridate::setdiff() masks base::setdiff()
## ✖ lubridate::union() masks base::union()
library(tidyverse, quietly = TRUE)
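With the packages loaded, read the downloaded archive into R. A minimal sketch, assuming the Takeout export was unzipped into the working directory and the search activity page is an HTML file (the name MyActivity.html below is a placeholder; adjust the path to match your own archive):

# Read the search activity page from the unzipped archive
# NOTE: "MyActivity.html" is a placeholder; use the path from your own export
search_archive <- read_html("MyActivity.html")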
# Extract the timestamp of each entry (the text after <br>, ending in AM/PM)
date_search <- search_archive %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  str_extract(pattern = "(?<=<br>)(.*)(?<=PM|AM)") %>%
  mdy_hms()
# Extract the search query or visited URL (the text inside the <a> tag)
text_search <- search_archive %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  str_extract(pattern = '(?<=<a)(.*)(?=</a>)') %>%
  str_extract(pattern = '(?<=\">)(.*)')
# Extract the activity type (e.g. "Searched" or "Visited")
type_search <- search_archive %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  str_extract(pattern = "(?<=mdl-typography--body-1\">)(.*)(?=<a)") %>%
  str_extract(pattern = "(\\w+)(?=\\s)")
# Assemble a tidy table with one row per archive entry
search_data <- tibble(timestamp = date_search,
                      date = as_date(date_search),
                      year = year(date_search),
                      month = month(date_search, label = TRUE),
                      day = weekdays(date_search),
                      hour = hour(date_search),
                      type = type_search,
                      search = text_search)
# Order the weekdays so plots follow the calendar
search_data$day <- factor(search_data$day,
                          levels = c("Sunday", "Monday", "Tuesday",
                                     "Wednesday", "Thursday", "Friday",
                                     "Saturday"))
# Drop rows where the date or text could not be parsed
search_data <- na.omit(search_data)
head(search_data)
## # A tibble: 6 x 8
## timestamp date year month day hour type search
## <dttm> <date> <dbl> <ord> <fct> <int> <chr> <chr>
## 1 2018-01-30 11:59:56 2018-01-30 2018 Jan Tuesday 11 Visited https…
## 2 2018-01-30 11:59:52 2018-01-30 2018 Jan Tuesday 11 Searched aligh…
## 3 2018-01-30 11:54:50 2018-01-30 2018 Jan Tuesday 11 Searched rambl…
## 4 2018-01-30 11:13:48 2018-01-30 2018 Jan Tuesday 11 Visited https…
## 5 2018-01-30 11:13:46 2018-01-30 2018 Jan Tuesday 11 Searched sakai
## 6 2018-01-30 10:57:06 2018-01-30 2018 Jan Tuesday 10 Visited https…
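# Search volume by year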
p <- ggplot(search_data, aes(year))
p + geom_bar()
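# Monthly search volume by year, 2015 through 2017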
monthly <- search_data[(search_data$year > 2014 & search_data$year < 2018), ]
ggplot(monthly) + geom_bar(aes(x = month, group = year)) +
  theme(axis.text.x = element_text(angle = 90)) +
  facet_grid(. ~ year, scales = "free")
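# Search volume by hour of the day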
p <- ggplot(search_data, aes(hour))
p + geom_bar()
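# Search volume by day of the week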
p <- ggplot(search_data, aes(day))
p + geom_bar()
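# Hourly search pattern, faceted by weekday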
ggplot(search_data) +
  geom_bar(aes(x = hour, group = day)) +
  facet_grid(. ~ day, scales = "free")
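# Weekday search volume, stacked by year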
wkday <- group_by(search_data, year, day) %>% summarize(count = n())
p <- ggplot(wkday, aes(day, count, fill = year))
p + geom_bar(stat = "identity") + labs(x = "", y = "Search Volume")
search <- tolower(search_data$search)
# Replace any non-ASCII characters with a space
search <- iconv(search, "ASCII", "UTF-8", sub = " ")
# Remove URLs, hashtags, @mentions, newlines, and escaped quotes
search <- gsub('(http|https)\\S+\\s*|(#|@)\\S+\\s*|\\n|\\"', " ", search)
# Remove .com addresses and any remaining non-alphanumeric characters
search <- gsub("(.*.)\\.com(.*.)\\S+\\s|[^[:alnum:]]", " ", search)
search <- trimws(search)
# Build a corpus and strip punctuation and numbers
search_corpus <- Corpus(VectorSource(search))
search_corpus <- tm_map(search_corpus, content_transformer(removePunctuation))
search_corpus <- tm_map(search_corpus, content_transformer(removeNumbers))
# Remove English stopwords plus a few account-specific terms
custom_stopwords <- c(stopwords("english"), "chrome", "chicago", "jlroo", "google")
search_corpus <- tm_map(search_corpus, removeWords, custom_stopwords)
# Build the term-document matrix and compute term frequencies
search_tdm <- TermDocumentMatrix(search_corpus)
search_matrix <- as.matrix(search_tdm)
word_freqs <- sort(rowSums(search_matrix), decreasing = TRUE)
d <- data.frame(word = names(word_freqs), freq = word_freqs)
wordcloud(d$word, d$freq, min.freq = 50, scale = c(3, 0.5), max.words = 200)
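As a quick check on the cloud, the most frequent terms can also be inspected directly from the frequency table built above:

# Top ten most frequent search terms
head(d, 10)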