Part 1. Retrieving Data

1.1. Installing Packages

Before retrieving any kind of data, let’s install the necessary packages.

#install.packages(c("gtrendsR","remotes","dplyr","tidyr","ggplot2","grid","gridExtra"))
library(remotes)
## Warning: package 'remotes' was built under R version 4.0.5
#remotes::install_github("trendecon/trendecon")

1.2. Our searches

At this point, we can start retrieving data from Google Trends through the ‘trendecon’ package, which gives access to the GT API. The trick from Askitas (2015) consists in building a list of ‘words’ -if you can call them that- which will yield no results. If our first word is something like ‘jdodjs’ and our search term is ‘wine’, then GT will try to compare both search terms but will come up empty when looking for ‘jdodjs’, so really, we will only obtain results for ‘wine’. If we make R loop through many searches composed of ‘wine’ + ‘word’, we will obtain different results for ‘wine’ each time, as the sample seems to be renewed with each query. Let’s try this for the search term ‘wine’ in Spain from 01-01-2011 to 31-12-2020.

In 10 years there are 120 months, and because the time period is long enough, we know GT will provide our results at a monthly frequency.
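
For instance, the combined keyword is simply the target term and the gibberish word joined by " + " (a one-line illustrative sketch, reusing the ‘jdodjs’ example from above):

paste("wine", "jdodjs", sep = " + ")
## [1] "wine + jdodjs"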

#Create our vector of searches
library(trendecon)
search<-c("wine")

#Generate 30 random 8-letter 'words' that should return no GT results
words <- sapply(rep(8, 30), function(x) paste(sample(c(letters, LETTERS), x, replace=TRUE), collapse=""))

#Pair every search term with every random word ("wine + <random letters>")
terms<-data.frame(matrix(nrow = length(search) * length(words), ncol = 1))
for (i in 1:length(search)){
        for (j in 1:length(words)){
                lastterm<-paste(search[i],words[j],sep = " + ")
                terms[(i-1)*length(words)+j,]=lastterm
        }
}
terms<-t(terms)


#Extract data with trendecon
GTdata<-data.frame(matrix(nrow = 120, ncol = length(terms)))
for (i in 1:length(terms)){
        #One GT query per combined term, retrying on failure
        extracc<-ts_gtrends(
                keyword = terms[i],
                category = "0",
                geo = "ES",
                time ="2011-01-01 2020-12-31",
                retry = 20,
                wait = 5,
                quiet = FALSE
        )
        #Keep only the 'value' column; each column of GTdata is one sample
        lastextracc<-extracc[,2]
        GTdata[,i]=lastextracc
        colnames(GTdata)[i]=terms[i]
}
## Downloading data for 2011-01-01 2020-12-31
## (this message repeats once for each of the 30 queries)
#print(GTdata)
rownames(GTdata)<-extracc$time
#save(extracc,file="extracc.RData")
#save(GTdata,file = "GTdata.RData")
extracc
## # A tibble: 120 x 2
##    time       value
##    <date>     <int>
##  1 2011-01-01    56
##  2 2011-02-01    60
##  3 2011-03-01    55
##  4 2011-04-01    61
##  5 2011-05-01    63
##  6 2011-06-01    59
##  7 2011-07-01    73
##  8 2011-08-01    58
##  9 2011-09-01    57
## 10 2011-10-01    60
## # ... with 110 more rows

If you get an error while running this code, download the data from: https://github.com/ecebcerupv/T4-Google-Trends-data-CARMA-2024-

1.3. Sampling error

As you might have noticed, the extractions from Google Trends vary significantly, and this becomes much clearer once we plot those searches. In the code below I have plotted 5 of these extractions to show how distorted the samples might be. I used a random sample to decide which ones to plot, but you can plot whichever ones you like!

load("C:/Users/ecebcer/OneDrive - UPV/CARMA 2024/GTdata.RData")
#Let’s randomly plot 5 of the samples we just obtained. 
library(dplyr)
library(tidyr)
library(ggplot2)
library(grid)
library(gridExtra)

s<-sample(1:length(words),5)

sampled_data <- GTdata %>%
        select(all_of(s)) %>%
        mutate(Time = 1:120)

# Reshape the data into long format
GTdata_long <- sampled_data %>%
        pivot_longer(cols = -Time, names_to = "Sample", values_to = "Value")

# Define desired colors
desired_colors <- c("red", "orange", "purple", "darkblue", "yellow")

# Plot the graph
graph <- ggplot(GTdata_long, aes(x = Time, y = Value, group = Sample, color = Sample)) +
        geom_line(alpha = 0.15) +
        theme_minimal() +
        theme(legend.position = "none") +
        ylab("SVI") +
        xlab("Time") +
        ylim(0, 100) +
        scale_x_continuous(breaks = c(0, 20, 40, 60, 80, 100, 120)) +
        theme(axis.title = element_text(size = 18)) +
        theme(axis.text = element_text(size = 18)) +
        theme(axis.text.y = element_text(margin = margin(t = 0, r = 0, b = 0, l = 0))) +
        theme(axis.text.x = element_text(margin = margin(t = 0, r = 0, b = 0, l = 0))) +
        scale_color_manual(values = desired_colors)

# Add bottom text
graph1 <- grid.arrange(graph, bottom = textGrob(
        expression(paste("Samples for the search term 'Wine'")), 
        vjust = 0.1, hjust = 0.5, gp = gpar(fontsize = 18, font = 1))
)
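
If you prefer numbers to pictures, the same sampling error can be quantified directly. This is a minimal base-R sketch (not part of the original pipeline; ‘spread’ is a name introduced here): for each month, the gap between the highest and lowest of the 30 samples shows how far two extractions can disagree.

#Per-month spread (max - min) across the 30 samples
spread <- apply(GTdata, 1, function(x) max(x) - min(x))
summary(spread)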

Part 2. Processing and averaging GT data

2.1. Calculating the Standard Deviation of a query

Now that we have checked that there is indeed a sampling error, the next step is to know how to fix it. You might remember that more popular terms need fewer extractions in order to be consistent, but how do we know which term is more popular? We use the average s.d. for the term: for each month, we compute the s.d. among the 30 extractions we just obtained, and then average those monthly values over the 120 months.

#Sdev: for each month, the s.d. across the 30 extractions
sdev<-data.frame(matrix(nrow = 120, ncol = 1))
for (i in 1:nrow(GTdata)) {
        lastsd<-sd(GTdata[i,])
        sdev[i,1]=lastsd
}
#colMeans averages the 120 monthly s.d. values into the final s.d.
wineSD<-colMeans(sdev)
save(wineSD,file = "wineSD.RData")
wineSD
## matrix.nrow...120..ncol...1. 
##                     3.090652
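
The loop above can also be collapsed into a single line with apply(); this is just an equivalent base-R sketch (‘wineSD_alt’ is a name introduced here), not a different method:

wineSD_alt <- mean(apply(GTdata, 1, sd))   #should match wineSD (~3.09)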

2.2. Minimum amount of error

Given the term’s average s.d. and the number of extractions n that we average over, the formula below tells us the minimum MAPE (mean absolute percentage error) we can expect to reach: MAPE = (1.3728*sd)/sqrt(n) + (0.0034*sd^3)/sqrt(n).

n<-ncol(GTdata)
MAPE<-(1.3728*wineSD)/sqrt(n) + (0.0034*wineSD^3)/sqrt(n)
save(MAPE,file = "minMAPE.RData")
MAPE
## matrix.nrow...120..ncol...1. 
##                    0.7929603

This means that, given our number of extractions, if we were to average them all, we would only be able to reduce the error to 0.793%. Let’s imagine for a moment that instead, we wanted to calculate the number of extractions necessary to reduce our error to 0.5% or to 0.1%. In that case, we could use this same formula, but solved for n: since MAPE = (1.3728*sd + 0.0034*sd^3)/sqrt(n), rearranging gives n = ((1.3728*sd + 0.0034*sd^3)/MAPE)^2.
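
As a quick sanity check (an illustrative sketch; ‘sd_wine’ is a name introduced here), plugging the values from above back into the formula reproduces the reported MAPE:

sd_wine <- 3.090652   #average s.d. from section 2.1
(1.3728 * sd_wine) / sqrt(30) + (0.0034 * sd_wine^3) / sqrt(30)
## ~0.793, matching the MAPE above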

2.3. Necessary amount of extractions

In this case, we assume a desired error of 0.1%.

dmape<-0.1
nec<-((1.3728*wineSD) + (0.0034*wineSD^3))^2/dmape^2
save(nec,file = "wineextractions.RData")
nec
## matrix.nrow...120..ncol...1. 
##                     1886.358

If we wanted a desired error of 0.1%, we would need to use 1886.36 extractions. We round this number up to 1887 since the number of extractions needs to be an integer, and with 1887, we are sure to cross the 0.1% MAPE threshold.
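
In R, this rounding up is simply ceiling(), as in this one-line sketch:

ceiling(nec)   #1887, the smallest integer above 1886.36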

If we assume a desired error of 0.5%…

dmape<-0.5
nec05<-((1.3728*wineSD) + (0.0034*wineSD^3))^2/dmape^2
save(nec05,file = "wineextractions05.RData")
nec05
## matrix.nrow...120..ncol...1. 
##                     75.45433

We would only need 76 extractions. For that, we could either download all 76 extractions at once, or add 46 extra ones to the 30 that we already have.
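
The same arithmetic in R (an illustrative sketch; all names are defined above):

ceiling(nec05)                  #76 extractions needed in total
ceiling(nec05) - ncol(GTdata)   #46 additional ones to download
#Check: with 76 extractions the expected MAPE dips just below 0.5%
((1.3728*wineSD) + (0.0034*wineSD^3))/sqrt(76)
## ~0.498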

Part 3. Obtaining the necessary extractions

Let’s say we finally settle on a 0.5% error for efficiency’s sake. In the code below we extract the remaining 46 observations.

3.1. Extracting the additional samples

#Create our vector of searches
library(trendecon)
search<-c("wine")

#Generate 46 more random 8-letter 'words'
words <- sapply(rep(8, 46), function(x) paste(sample(c(letters, LETTERS), x, replace=TRUE), collapse=""))

terms<-data.frame(matrix(nrow = length(search) * length(words), ncol = 1))
for (i in 1:length(search)){
        for (j in 1:length(words)){
                lastterm<-paste(search[i],words[j],sep = " + ")
                terms[(i-1)*length(words)+j,]=lastterm
        }
}
terms<-t(terms)


#Extract data with trendecon
GTdataextra<-data.frame(matrix(nrow = 120, ncol = length(terms)))
for (i in 1:length(terms)){
        extracc<-ts_gtrends(
                keyword = terms[i],
                category = "0",
                geo = "ES",
                time ="2011-01-01 2020-12-31",
                retry = 20,
                wait = 5,
                quiet = FALSE
        )
        lastextracc<-extracc[,2]
        GTdataextra[,i]=lastextracc
        colnames(GTdataextra)[i]=terms[i]
}
#print(GTdataextra)
rownames(GTdataextra)<-extracc$time
#save(extracc,file="extracc.RData")
#save(GTdataextra,file = "GTdataextra.RData")

3.2. Creating the consistent SVI for a 0.5% MAPE

load("C:/Users/ecebcer/OneDrive - UPV/CARMA 2024/GTdataextra.RData")
extractions<-cbind(GTdata,GTdataextra)
#Average the 76 samples and rescale so the series peaks at 100, as GT does
SVI<-rowMeans(extractions)
FinalSVI<-SVI/max(SVI)*100
#save(FinalSVI,file = "SVI.RData")
FinalSVI
## 2011-01-01 2011-02-01 2011-03-01 2011-04-01 2011-05-01 2011-06-01 2011-07-01 
##   56.49160   60.79712   56.94481   61.25033   64.51613   59.73074   73.96694 
## 2011-08-01 2011-09-01 2011-10-01 2011-11-01 2011-12-01 2012-01-01 2012-02-01 
##   57.93122   57.58464   61.42362   64.91602   64.06292   54.34551   59.90403 
## 2012-03-01 2012-04-01 2012-05-01 2012-06-01 2012-07-01 2012-08-01 2012-09-01 
##   55.89176   53.87897   61.33031   56.21168   59.22421   55.95841   60.74380 
## 2012-10-01 2012-11-01 2012-12-01 2013-01-01 2013-02-01 2013-03-01 2013-04-01 
##   61.87683   60.98374   60.53053   55.86510   51.34631   58.74433   60.54385 
## 2013-05-01 2013-06-01 2013-07-01 2013-08-01 2013-09-01 2013-10-01 2013-11-01 
##   61.71688   62.42335   60.03732   56.95814   61.71688   60.01066   60.86377 
## 2013-12-01 2014-01-01 2014-02-01 2014-03-01 2014-04-01 2014-05-01 2014-06-01 
##   66.79552   61.07705   62.06345   60.49054   63.64969   64.74274   69.60810 
## 2014-07-01 2014-08-01 2014-09-01 2014-10-01 2014-11-01 2014-12-01 2015-01-01 
##   69.04825   71.08771   67.23540   67.88856   65.92909   72.84724   63.90296 
## 2015-02-01 2015-03-01 2015-04-01 2015-05-01 2015-06-01 2015-07-01 2015-08-01 
##   62.92989   63.43642   67.50200   63.39643   69.22154   70.03466   69.59477 
## 2015-09-01 2015-10-01 2015-11-01 2015-12-01 2016-01-01 2016-02-01 2016-03-01 
##   69.44815   70.10131   71.23434   73.68702   65.51586   70.68782   74.50013 
## 2016-04-01 2016-05-01 2016-06-01 2016-07-01 2016-08-01 2016-09-01 2016-10-01 
##   77.71261   87.33671  100.00000   87.64330   80.35191   83.71101   79.89869 
## 2016-11-01 2016-12-01 2017-01-01 2017-02-01 2017-03-01 2017-04-01 2017-05-01 
##   79.29885   82.05812   73.66036   76.45961   77.95255   76.80619   80.85844 
## 2017-06-01 2017-07-01 2017-08-01 2017-09-01 2017-10-01 2017-11-01 2017-12-01 
##   78.75233   84.73740   80.77846   86.07038   75.40656   75.37990   82.01813 
## 2018-01-01 2018-02-01 2018-03-01 2018-04-01 2018-05-01 2018-06-01 2018-07-01 
##   68.78166   75.23327   74.80672   79.65876   82.08478   85.41722   86.96348 
## 2018-08-01 2018-09-01 2018-10-01 2018-11-01 2018-12-01 2019-01-01 2019-02-01 
##   85.47054   88.80299   83.92429   77.97921   83.81765   73.06052   76.47294 
## 2019-03-01 2019-04-01 2019-05-01 2019-06-01 2019-07-01 2019-08-01 2019-09-01 
##   79.64543   80.59184   88.53639   94.13490   91.62890   92.69528   90.49587 
## 2019-10-01 2019-11-01 2019-12-01 2020-01-01 2020-02-01 2020-03-01 2020-04-01 
##   86.75020   81.36497   84.31085   79.37883   81.07171   57.69128   59.13090 
## 2020-05-01 2020-06-01 2020-07-01 2020-08-01 2020-09-01 2020-10-01 2020-11-01 
##   56.99813   53.98560   60.99707   64.94268   58.87763   58.41109   59.81072 
## 2020-12-01 
##   68.50173
#Let’s plot the same 5 samples as before, together with the consistent SVI. 
library(dplyr)
library(tidyr)
library(ggplot2)
library(grid)
library(gridExtra)

GTdata$SVI=FinalSVI

# Plot the graph
# Create a data frame with the sampled columns and the time variable
sampled_data <- GTdata %>%
        select(all_of(s)) %>%
        mutate(Time = 1:120, SVI=GTdata$SVI)

# Reshape the data into long format
GTdata_long <- sampled_data[,1:6] %>%
        pivot_longer(cols = -Time, names_to = "Sample", values_to = "Value")
GTdata_long2 <- sampled_data[,6:7] %>%
        pivot_longer(cols = -Time, names_to = "Sample", values_to = "Value")

# Define desired colors
desired_colors <- c("red", "orange", "purple", "darkblue", "yellow")

# Plot the graph
graph <- ggplot(GTdata_long, aes(x = Time, y = Value, group = Sample, color = Sample)) +
        geom_line(alpha = 0.25) +
        theme_minimal() +
        theme(legend.position = "none") +
        ylab("SVI") +
        xlab("Time") +
        ylim(0, 100) +
        scale_x_continuous(breaks = c(0, 20, 40, 60, 80, 100, 120)) +
        theme(axis.title = element_text(size = 18)) +
        theme(axis.text = element_text(size = 18)) +
        theme(axis.text.y = element_text(margin = margin(t = 0, r = 0, b = 0, l = 0))) +
        theme(axis.text.x = element_text(margin = margin(t = 0, r = 0, b = 0, l = 0))) +
        scale_color_manual(values = desired_colors)

# Add the new object (FinalSVI from GTdata) as another line to the plot
graph <- graph + geom_line(data = GTdata_long2, aes(x = Time, y = Value), color = "black", alpha = 0.65)

# Add bottom text
graph1 <- grid.arrange(graph, bottom = textGrob(
        expression(paste("Samples for the search term 'Wine' and the consistent SVI")), 
        vjust = 0.1, hjust = 0.5, gp = gpar(fontsize = 18, font = 1))
)

# Display the graph
print(graph1)
## TableGrob (2 x 1) "arrange": 2 grobs
##   z     cells    name               grob
## 1 1 (1-1,1-1) arrange     gtable[layout]
## 2 2 (2-2,1-1) arrange text[GRID.text.68]

Finally, the series which we have called FinalSVI is the one we would use to try to improve forecasting, nowcasting, etc.
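
As a closing illustration (a minimal sketch using base R’s stats functions; the model choice is arbitrary and the names ‘svi_ts’ and ‘fit’ are introduced here), the consistent SVI can be turned into a monthly time series and fed to any standard forecasting model:

#Monthly time series starting January 2011
svi_ts <- ts(FinalSVI, start = c(2011, 1), frequency = 12)

#An arbitrary seasonal ARIMA, standing in for whichever model you prefer
fit <- arima(svi_ts, order = c(1, 0, 0),
             seasonal = list(order = c(0, 1, 1), period = 12))
predict(fit, n.ahead = 12)$pred   #12-month-ahead forecast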