Before retrieving any kind of data, let’s install the necessary packages.
#install.packages("gtrendsR","remotes","dplyr","tidyr","ggplot2","grid","gridExtra")
library(remotes)
## Warning: package 'remotes' was built under R version 4.0.5
#remotes::install_github("trendecon/trendecon")
At this point, we can start retrieving data from Google Trends through the ‘trendecon’ package which allows access to the GT API. The trick from Askitas (2015) consists in filling a a list of ‘words’ -if you can call them that- which will yield no result. If our first word is something like ‘jdodjs’ and our search term is ‘wine’ then GT will try to compare both search terms but will come up empty when looking for ‘jdodjs’ so really, we wil only obtain results for ‘wine’. If we make R loop through a lot of searches composed of ‘wine’ + ‘word’ then we will obtain different results for ‘wine’ as the sample seems to be renewed with each query. Let’s try this for the search term ‘wine’ in Spain from 01-01-2011 to 31-12-2020.
In 10 years there are 120 months, and because the time period is long enough, we know GT will provide our results in a monthly frequency.
#Create our vector of searches
library(trendecon)
search<-c("wine")
words <- sapply(rep(8, 30), function(x) paste(sample(c(letters, LETTERS), x, replace=TRUE), collapse=""))
terms<-data.frame(matrix(nrow = length(search) * length(words), ncol = 1))
for (i in 1:length(search)){
for (j in 1:length(words)){
lastterm<-paste(search[i],words[j],sep = " + ")
terms[j,]=lastterm
}
}
terms<-t(terms)
#Extract data with trendecon
GTdata<-data.frame(matrix(nrow = 120, ncol = length(terms)))
for (i in 1:length(terms)){
extracc<-ts_gtrends(
keyword = terms[i],
category = "0",
geo = "ES",
time ="2011-01-01 2020-12-31",
retry = 20,
wait = 5,
quiet = FALSE
)
lastextracc<-extracc[,2]
GTdata[,i]=lastextracc
colnames(GTdata)[i]=terms[i]
}
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
## Downloading data for 2011-01-01 2020-12-31
#print(GTdata)
rownames(GTdata)<-extracc$time
#save(extracc,file="extracc.RData")
#save(GTdata,file = "GTdata.RData")
extracc
## # A tibble: 120 x 2
## time value
## <date> <int>
## 1 2011-01-01 56
## 2 2011-02-01 60
## 3 2011-03-01 55
## 4 2011-04-01 61
## 5 2011-05-01 63
## 6 2011-06-01 59
## 7 2011-07-01 73
## 8 2011-08-01 58
## 9 2011-09-01 57
## 10 2011-10-01 60
## # ... with 110 more rows
If you get an error while running this code, download the data from: https://github.com/ecebcerupv/T4-Google-Trends-data-CARMA-2024-
As you might have noticed, the extractions from Google Trends vary significantly, and you might be able to see this more clearly if we plot those searches. In the code below I have plotted 5 of these extractions to show how distorted the same might be. I used a random sample to decide which ones to plot, but you can plot whichever ones you like!
load("C:/Users/ecebcer/OneDrive - UPV/CARMA 2024/GTdata.RData")
#Let's randomly plot 10 of the samples we just obtained.
library(dplyr)
library(tidyr)
library(ggplot2)
library(grid)
library(gridExtra)
s<-sample(1:length(words),5)
sampled_data <- GTdata %>%
select(all_of(s)) %>%
mutate(Time = 1:120)
# Reshape the data into long format
GTdata_long <- sampled_data %>%
pivot_longer(cols = -Time, names_to = "Sample", values_to = "Value")
# Define desired colors
desired_colors <- c("red", "orange", "purple", "darkblue", "yellow")
# Plot the graph
graph <- ggplot(GTdata_long, aes(x = Time, y = Value, group = Sample, color = Sample)) +
geom_line(alpha = 0.15) +
theme_minimal() +
theme(legend.position = "none") +
ylab("SVI") +
xlab("Time") +
ylim(0, 100) +
scale_x_continuous(breaks = c(0, 20, 40, 60, 80, 100, 120)) +
theme(axis.title = element_text(size = 18)) +
theme(axis.text = element_text(size = 18)) +
theme(axis.text.y = element_text(margin = margin(t = 0, r = 0, b = 0, l = 0))) +
theme(axis.text.x = element_text(margin = margin(t = 0, r = 0, b = 0, l = 0))) +
scale_color_manual(values = desired_colors)
# Add bottom text
graph1 <- grid.arrange(graph, bottom = textGrob(
expression(paste("Samples for the search term 'Wine'")),
vjust = 0.1, hjust = 0.5, gp = gpar(fontsize = 18, font = 1))
)
Now that we have checked that there is indeed a sampling error, the next step is to know how to fix it. Now, you might remember that more popular terms need less extractions in order to be consistent, but how do we know which term is more popular? We use the average s.d. for the term. In this case, the s.d. of the term will be the s.d. among the 30 extractions we just obtained.
#Sdev
sdev<-data.frame(matrix(nrow = 120, ncol = 1))
i<-NULL
for (i in 1:nrow(GTdata)) {
lastsd<-sd(GTdata[i,])
sdev[i,1]=lastsd
}
#El colMeans es para obtener la s.d. final
wineSD<-colMeans(sdev)
save(wineSD,file = "wineSD.RData")
wineSD
## matrix.nrow...120..ncol...1.
## 3.090652
n<-ncol(GTdata)
MAPE<-(1.3728*wineSD)/sqrt(n) + (0.0034*wineSD^3)/sqrt(n)
save(MAPE,file = "minMAPE.RData")
MAPE
## matrix.nrow...120..ncol...1.
## 0.7929603
This means that given our number of extractions, if we were to average them all, we would only be able to reduce the error to 0.793%. Let’s imagine for a moment that instead, we wanted to calculate the necessary amount of extractions necessary to reduce our error to 0.5% and to 0.1%. In that case, we could use this same formula, but solved for n.
In this case, we assume a desired error of 0.1%.
dmape<-0.1
nec<-((1.3728*wineSD) + (0.0034*wineSD^3))^2/dmape^2
save(nec,file = "wineextractions.RData")
nec
## matrix.nrow...120..ncol...1.
## 1886.358
If we wanted a desired error of 0.1%, we would need to use 1886.36 extractions. We round this number up to 1887 since the number of extractions needs to be an integer, and with 1887, we are sure to cross the 0.1% MAPE threshold.
If we assume a desired error of 0.5%…
dmape<-0.5
nec05<-((1.3728*wineSD) + (0.0034*wineSD^3))^2/dmape^2
save(nec,file = "wineextractions05.RData")
nec05
## matrix.nrow...120..ncol...1.
## 75.45433
We would only need 76 extractions. For that, we could either download all 76 extractions at once, or add 46 extra to the ones that we have.
Let’s say we finally settle on a 0.5% error for efficiency sake. In the code below we extract the rest of necessary observations.
#Create our vector of searches
library(trendecon)
search<-c("wine")
words <- sapply(rep(8, 46), function(x) paste(sample(c(letters, LETTERS), x, replace=TRUE), collapse=""))
terms<-data.frame(matrix(nrow = length(search) * length(words), ncol = 1))
for (i in 1:length(search)){
for (j in 1:length(words)){
lastterm<-paste(search[i],words[j],sep = " + ")
terms[j,]=lastterm
}
}
terms<-t(terms)
#Extract data with trendecon
GTdataextra<-data.frame(matrix(nrow = 120, ncol = length(terms)))
for (i in 1:length(terms)){
extracc<-ts_gtrends(
keyword = terms[i],
category = "0",
geo = "ES",
time ="2011-01-01 2020-12-31",
retry = 20,
wait = 5,
quiet = FALSE
)
lastextracc<-extracc[,2]
GTdataextra[,i]=lastextracc
colnames(GTdataextra)[i]=terms[i]
}
#print(GTdataextra)
rownames(GTdataextra)<-extracc$time
#save(extracc,file="extracc.RData")
#save(GTdataextra,file = "GTdataextra.RData")
load("C:/Users/ecebcer/OneDrive - UPV/CARMA 2024/GTdataextra.RData")
extractions<-cbind(GTdata,GTdataextra)
SVI<-rowMeans(extractions)
FinalSVI<-SVI/max(SVI)*100
#save(FinalSVI,file = "SVI.RData")
FinalSVI
## 2011-01-01 2011-02-01 2011-03-01 2011-04-01 2011-05-01 2011-06-01 2011-07-01
## 56.49160 60.79712 56.94481 61.25033 64.51613 59.73074 73.96694
## 2011-08-01 2011-09-01 2011-10-01 2011-11-01 2011-12-01 2012-01-01 2012-02-01
## 57.93122 57.58464 61.42362 64.91602 64.06292 54.34551 59.90403
## 2012-03-01 2012-04-01 2012-05-01 2012-06-01 2012-07-01 2012-08-01 2012-09-01
## 55.89176 53.87897 61.33031 56.21168 59.22421 55.95841 60.74380
## 2012-10-01 2012-11-01 2012-12-01 2013-01-01 2013-02-01 2013-03-01 2013-04-01
## 61.87683 60.98374 60.53053 55.86510 51.34631 58.74433 60.54385
## 2013-05-01 2013-06-01 2013-07-01 2013-08-01 2013-09-01 2013-10-01 2013-11-01
## 61.71688 62.42335 60.03732 56.95814 61.71688 60.01066 60.86377
## 2013-12-01 2014-01-01 2014-02-01 2014-03-01 2014-04-01 2014-05-01 2014-06-01
## 66.79552 61.07705 62.06345 60.49054 63.64969 64.74274 69.60810
## 2014-07-01 2014-08-01 2014-09-01 2014-10-01 2014-11-01 2014-12-01 2015-01-01
## 69.04825 71.08771 67.23540 67.88856 65.92909 72.84724 63.90296
## 2015-02-01 2015-03-01 2015-04-01 2015-05-01 2015-06-01 2015-07-01 2015-08-01
## 62.92989 63.43642 67.50200 63.39643 69.22154 70.03466 69.59477
## 2015-09-01 2015-10-01 2015-11-01 2015-12-01 2016-01-01 2016-02-01 2016-03-01
## 69.44815 70.10131 71.23434 73.68702 65.51586 70.68782 74.50013
## 2016-04-01 2016-05-01 2016-06-01 2016-07-01 2016-08-01 2016-09-01 2016-10-01
## 77.71261 87.33671 100.00000 87.64330 80.35191 83.71101 79.89869
## 2016-11-01 2016-12-01 2017-01-01 2017-02-01 2017-03-01 2017-04-01 2017-05-01
## 79.29885 82.05812 73.66036 76.45961 77.95255 76.80619 80.85844
## 2017-06-01 2017-07-01 2017-08-01 2017-09-01 2017-10-01 2017-11-01 2017-12-01
## 78.75233 84.73740 80.77846 86.07038 75.40656 75.37990 82.01813
## 2018-01-01 2018-02-01 2018-03-01 2018-04-01 2018-05-01 2018-06-01 2018-07-01
## 68.78166 75.23327 74.80672 79.65876 82.08478 85.41722 86.96348
## 2018-08-01 2018-09-01 2018-10-01 2018-11-01 2018-12-01 2019-01-01 2019-02-01
## 85.47054 88.80299 83.92429 77.97921 83.81765 73.06052 76.47294
## 2019-03-01 2019-04-01 2019-05-01 2019-06-01 2019-07-01 2019-08-01 2019-09-01
## 79.64543 80.59184 88.53639 94.13490 91.62890 92.69528 90.49587
## 2019-10-01 2019-11-01 2019-12-01 2020-01-01 2020-02-01 2020-03-01 2020-04-01
## 86.75020 81.36497 84.31085 79.37883 81.07171 57.69128 59.13090
## 2020-05-01 2020-06-01 2020-07-01 2020-08-01 2020-09-01 2020-10-01 2020-11-01
## 56.99813 53.98560 60.99707 64.94268 58.87763 58.41109 59.81072
## 2020-12-01
## 68.50173
#Let's randomly plot 10 of the samples we just obtained.
library(dplyr)
library(tidyr)
library(ggplot2)
library(grid)
library(gridExtra)
GTdata$SVI=FinalSVI
# Plot the graph
# Create a data frame with the sampled columns and the time variable
sampled_data <- GTdata %>%
select(all_of(s)) %>%
mutate(Time = 1:120, SVI=GTdata$SVI)
# Reshape the data into long format
GTdata_long <- sampled_data[,1:6] %>%
pivot_longer(cols = -Time, names_to = "Sample", values_to = "Value")
GTdata_long2 <- sampled_data[,6:7] %>%
pivot_longer(cols = -Time, names_to = "Sample", values_to = "Value")
# Define desired colors
desired_colors <- c("red", "orange", "purple", "darkblue", "yellow")
# Plot the graph
graph <- ggplot(GTdata_long, aes(x = Time, y = Value, group = Sample, color = Sample)) +
geom_line(alpha = 0.25) +
theme_minimal() +
theme(legend.position = "none") +
ylab("SVI") +
xlab("Time") +
ylim(0, 100) +
scale_x_continuous(breaks = c(0, 20, 40, 60, 80, 100, 120)) +
theme(axis.title = element_text(size = 18)) +
theme(axis.text = element_text(size = 18)) +
theme(axis.text.y = element_text(margin = margin(t = 0, r = 0, b = 0, l = 0))) +
theme(axis.text.x = element_text(margin = margin(t = 0, r = 0, b = 0, l = 0))) +
scale_color_manual(values = desired_colors)
# Add the new object (FinalSVI from GTdata) as another line to the plot
graph <- graph + geom_line(data = GTdata_long2, aes(x = Time, y = Value), color = "black", alpha = 0.65)
# Add bottom text
graph1 <- grid.arrange(graph, bottom = textGrob(
expression(paste("Samples for the search term 'Wine' and the consistent SVI")),
vjust = 0.1, hjust = 0.5, gp = gpar(fontsize = 18, font = 1))
)
# Display the graph
print(graph1)
## TableGrob (2 x 1) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (2-2,1-1) arrange text[GRID.text.68]
Finally, the term which we have called FinalSVI would be the one which we would use to try to improve forecasting, nowcasting, etc.