The Google search data in this notebook comes from a Google account archive.
The steps outlined here to collect and analyze the data may change at any time.
Below are the steps to claim your Google account data.
# Packages this analysis depends on, in the order they are attached.
required_packages <- c("rvest", "lubridate", "wordcloud", "tm", "tidyverse")

# Install any package that is missing, then attach it.
for (pkg in required_packages) {
  # requireNamespace() only checks availability; attaching happens below.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    # install.packages() needs the name as a character string, and its
    # "be quiet" argument is `quiet` (not `quietly`).
    install.packages(pkg, dependencies = TRUE, quiet = TRUE)
  }
  # character.only = TRUE makes library() use the value of `pkg`.
  library(pkg, character.only = TRUE, quietly = TRUE)
}
library(tm)
library(wordcloud)
library(tidyverse)
# Select the activity entry nodes (the <div>s nested two levels below the
# "mdl-grid" container) and serialise them to HTML strings once. All three
# extractions below work on this same character vector, so the XPath query
# and the node-to-string conversion are not repeated.
# NOTE(review): `search_archive` is created outside this section; it is
# presumably the parsed archive HTML document -- confirm.
search_entries <- search_archive %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  as.character()

# Timestamp: the text after a <br> up to and including the AM/PM marker,
# parsed as month-day-year hour-minute-second.
date_search <- search_entries %>%
  str_extract(pattern = "(?<=<br>)(.*)(?<=PM|AM)") %>%
  mdy_hms()

# Search text: the contents of the entry's <a> element, i.e. whatever
# follows the `">` that closes the opening tag.
text_search <- search_entries %>%
  str_extract(pattern = '(?<=<a)(.*)(?=</a>)') %>%
  str_extract(pattern = '(?<=\">)(.*)')

# Activity type: the first word (followed by whitespace) of the text that
# sits between the "mdl-typography--body-1" opening tag and the link.
type_search <- search_entries %>%
  str_extract(pattern = "(?<=mdl-typography--body-1\">)(.*)(?=<a)") %>%
  str_extract(pattern = "(\\w+)(?=\\s)")
# English weekday names, Sunday first. Used as the factor labels for `day`.
weekday_names <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)

# One row per archive entry: the parsed timestamp, calendar fields derived
# from it, the activity type and the search text.
search_data <- tibble(
  timestamp = date_search,
  date = as_date(date_search),
  year = year(date_search),
  month = month(date_search, label = TRUE),
  # Build the weekday factor from the weekday *number* (1 = Sunday when
  # week_start = 7). weekdays() returns names in the session locale, which
  # would not match the English levels on a non-English system and would
  # turn every day into NA (and na.omit() below would then drop every row).
  day = factor(
    wday(date_search, week_start = 7),
    levels = 1:7,
    labels = weekday_names
  ),
  hour = hour(date_search),
  type = type_search,
  search = text_search
)

# Drop entries where any field is missing (e.g. an unparsed timestamp or
# an entry without search text).
search_data <- na.omit(search_data)
head(search_data)
# Number of searches per year.
p <- ggplot(data = search_data, mapping = aes(x = year))
p + geom_bar()

# Number of searches per month, restricted to years strictly between
# 2014 and 2018, with one panel per year.
monthly <- search_data[with(search_data, year > 2014 & year < 2018), ]
ggplot(monthly, aes(x = month, group = year)) +
  geom_bar() +
  facet_grid(. ~ year, scales = "free") +
  theme(axis.text.x = element_text(angle = 90))

# Number of searches per hour of the day.
p <- ggplot(data = search_data, mapping = aes(x = hour))
p + geom_bar()

# Number of searches per day of the week.
p <- ggplot(data = search_data, mapping = aes(x = day))
p + geom_bar()

# Hourly distribution of searches, one panel per day of the week.
ggplot(search_data, aes(x = hour, group = day)) +
  geom_bar() +
  facet_grid(. ~ day, scales = "free")
# Search volume for every (year, weekday) pair. count() returns an ungrouped
# table, so no leftover grouping by `year` leaks into later steps.
wkday <- count(search_data, year, day) %>%
  rename(count = n)

# Stacked bars of search volume per weekday, one colour per year. `year` is
# converted to a factor so it gets a discrete fill scale and legend instead
# of a continuous colour gradient; the legend title is kept as "year".
p <- ggplot(wkday, aes(x = day, y = count, fill = factor(year)))
p + geom_col() + labs(x = "", y = "Search Volume", fill = "year")
# Normalise the raw query text before it is turned into a corpus.
# (A commented-out iconv() re-encoding step used to follow tolower().)
search <- search_data$search %>%
  tolower() %>%
  # Blank out URLs, #hashtags, @mentions, newlines and double quotes.
  gsub(
    pattern = '(http|https)\\S+\\s*|(#|@)\\S+\\s*|\\n|\\"',
    replacement = " "
  ) %>%
  # Blank out ".com" fragments and every non-alphanumeric character.
  gsub(
    pattern = "(.*.)\\.com(.*.)\\S+\\s|[^[:alnum:]]",
    replacement = " "
  ) %>%
  # Strip leading and trailing whitespace left behind by the substitutions.
  trimws()
# Words to remove from the corpus: the English stop-word list plus a few
# extra terms chosen for this search history.
stopwords <- c(
  stopwords("english"),
  "chrome", "chicago", "jlroo", "google", "online", "good", "rate"
)

# Build the corpus from the cleaned queries, then strip punctuation,
# numbers and the stop words, in that order.
search_corpus <- VectorSource(search) %>%
  Corpus() %>%
  tm_map(content_transformer(removePunctuation)) %>%
  tm_map(content_transformer(removeNumbers)) %>%
  tm_map(removeWords, stopwords)

# Term-document matrix of the corpus, and the same data as a plain matrix.
search_tdm <- TermDocumentMatrix(search_corpus)
search_matrix <- as.matrix(search_tdm)
# Build the word/frequency table the word cloud is drawn from. Rows of a
# term-document matrix are terms, so the row sums of `search_matrix` are the
# total number of occurrences of each term across all searches.
term_freq <- sort(rowSums(search_matrix), decreasing = TRUE)
d <- data.frame(
  word = names(term_freq),
  freq = term_freq,
  stringsAsFactors = FALSE
)

# Draw at most 200 terms, skipping any that occur fewer than 30 times.
wordcloud(d$word, d$freq, min.freq = 30, scale = c(2, 0.5), max.words = 200)