library(rvest)
library(plyr)
library(pdftools)
library(glue)
library(tidyverse)
library(tm)
library(textcat)
library(tidytext)
library(stringr)
library(dplyr)
library(purrr)
library(textmineR)
library(quanteda)
library(newsmap)
library(maps)
library(data.table)
library(reshape2) # provides melt(), used for the heatmap matrices below
library(ggthemes)
rm(list=ls())
setwd("~/Desktop/R/Introductory Data Science/Group Project")
data_frame <- read.csv("Covid-19 Policy Mentions - 2020-04-06.csv") %>%
select(Outlet.or.Author, Mention.Title, Country, Mention.Date,
Mention.URL, Research.Output.Title, Publication.Date, Journal.Collection.Title,
Affiliations..GRID., Altmetric.Attention.Score)
# We create a character vector of unique URLs from which we will download policy documents
urls <- data_frame$Mention.URL %>%
as.character %>%
unique
# We define the pattern used to name the downloaded PDFs. Then we switch the working directory to the download folder and download each PDF from the urls vector.
pdf_names <- paste0("document_", seq_along(urls), ".pdf")
setwd("~/Desktop/R/Introductory Data Science/Group Project/PDF downloads")
mapply(download.file, urls, pdf_names)
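# Not part of the original workflow: some URLs may be dead or point to something other
# than a PDF, and a single failure would stop the mapply() call above. A hedged
# alternative (the safe_download helper is just an illustration) wraps download.file()
# in tryCatch() and uses mode = "wb" so binary files are written correctly on all platforms.
safe_download <- function(url, destfile) {
  tryCatch(
    download.file(url, destfile, mode = "wb", quiet = TRUE),
    error = function(e) {
      message("Failed to download: ", url)
      NA_integer_
    }
  )
}
invisible(mapply(safe_download, urls, pdf_names))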
# This is a preliminary step that we need in order to filter out policy documents that are not in English. To do that, we create a corpus out of all PDFs located in our current working directory (PDF downloads).
setwd("~/Desktop/R/Introductory Data Science/Group Project/PDF downloads")
file_vector <- list.files(path = getwd())
corpus <- Corpus(URISource(file_vector),
readerControl = list(reader = readPDF))
# After we create the corpus, we use the textcat function to identify each document's language and delete the non-English documents from the working directory.
languages <- textcat(corpus)
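# A quick sanity check (not in the original pipeline): inspect the detected language
# distribution before deleting anything from disk.
table(languages, useNA = "ifany")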
names(languages)[languages != "english"] %>%
unlink()
# Now we create a new corpus consisting of the English documents that remain in the working directory.
setwd("~/Desktop/R/Introductory Data Science/Group Project/PDF downloads")
file_vector <- list.files(path = getwd())
corpus <- Corpus(URISource(file_vector),
readerControl = list(reader = readPDF))
# An additional transformation is needed to match our text data with the document variables from our initial data set (data_frame).
# First, we match the unique URLs from the original data set with the file names of the documents in the PDF downloads folder.
knitr::opts_knit$set(root.dir = "~/Desktop/R/Introductory Data Science/Group Project/PDF downloads" )
df_uniq_url <- data_frame[!duplicated(data_frame$Mention.URL), ]
df_uniq_url$document <- paste0("document_", seq_len(nrow(df_uniq_url)), ".pdf")
# Now we remove the rows that correspond to the non-English documents we deleted from the working directory a few steps earlier.
df_uniq_url <- df_uniq_url[df_uniq_url$document %in% file_vector, ]
# We convert the corpus into a data frame
data_policy <- data.frame(text=sapply(corpus, identity),
stringsAsFactors = FALSE)
# And we remove the second row, which contains the document metadata picked up by the PDF reader. The problem with this metadata is that it is very incomplete, which is why we go through all these transformations to link each text to its metadata from the original data set.
data_policy <- data_policy[-2,]
# We reshape the data to long format, so that each row corresponds to one document.
data_policy <- gather(data_policy , key = "document", value = "text")
# Remove the "text." prefix from each document name.
data_policy$document <- gsub("text.", "", data_policy$document, fixed = TRUE)
# And finally we merge the text data frame with our initial data frame.
merged_data <- merge(df_uniq_url, data_policy, by.x = "document", by.y = "document")
merged_data$text <- as.character(merged_data$text)
merged_data <- merged_data %>%
select(document, Mention.Title, Mention.Date, text, Outlet.or.Author, Country,
Publication.Date, Journal.Collection.Title, Affiliations..GRID.,
Altmetric.Attention.Score)
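# A quick sanity check (an addition, not in the original workflow): every remaining PDF
# should have found its metadata row, so the two counts ought to match.
nrow(merged_data) == length(file_vector)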
# Remove the "c(" artefact left over from collapsing each document's page vector into a single string; it would be harder to strip out later.
merged_data$text <- gsub("c(", "", merged_data$text, fixed = TRUE)
# And finally create a new corpus from the data frame, which combines both text and document variables.
corpus_policy <- corpus(merged_data, docid_field = "document",
                        text_field = "text")
# There is one more cleaning step to perform. Some of the documents we have downloaded are updates or later versions of the same document. They are very similar in content and would distort our subsequent analysis, so we calculate pairwise similarity and remove the near-duplicates from our corpus.
# To calculate similarity we create a document-feature matrix and use the textstat_simil function.
dfm_cov <- dfm(corpus_policy,
tolower = TRUE,
remove_punct = TRUE,
remove_symbols = TRUE,
remove_numbers = TRUE,
dictionary = NULL)
similarity <- textstat_simil(dfm_cov)
similarity <- as.data.frame(similarity)
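# The default similarity method is "correlation", which is why the column used below is
# called correlation. Spelling the arguments out does not change the result, but it makes
# the choice explicit -- an equivalent, more verbose version:
similarity <- textstat_simil(dfm_cov, method = "correlation", margin = "documents") %>%
  as.data.frame()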
# We set our similarity threshold at 0.95, which works well for flagging documents that are versions of the same underlying document.
threshold <- 0.95
# Now we identify documents whose similarity to another document exceeds 0.95 and save their names in a character vector.
todrop <- subset(similarity, select = document1, subset = correlation > threshold, drop = TRUE)
todrop <- as.vector(unique(todrop))
# Can we finally do some text analysis, please? Yes, now we can. We will mostly use the quanteda package from here on. First, we create tokens.
tokens <- tokens(corpus_policy,
what = "word",
remove_punct = TRUE,
remove_symbols = TRUE,
remove_numbers = TRUE,
include_docvars = TRUE)
# We remove the tokens of the highly similar documents that we identified previously.
tokens <- tokens[!names(tokens) %in% todrop]
# These are the remaining documents.
names(tokens)
## [1] "document_1.pdf" "document_11.pdf" "document_12.pdf" "document_14.pdf"
## [5] "document_16.pdf" "document_17.pdf" "document_2.pdf" "document_23.pdf"
## [9] "document_24.pdf" "document_26.pdf" "document_27.pdf" "document_28.pdf"
## [13] "document_3.pdf" "document_31.pdf" "document_32.pdf" "document_33.pdf"
## [17] "document_34.pdf" "document_35.pdf" "document_36.pdf" "document_37.pdf"
## [21] "document_38.pdf" "document_40.pdf" "document_44.pdf" "document_46.pdf"
## [25] "document_49.pdf" "document_5.pdf" "document_50.pdf" "document_52.pdf"
## [29] "document_53.pdf" "document_54.pdf" "document_55.pdf" "document_57.pdf"
## [33] "document_58.pdf" "document_6.pdf" "document_7.pdf" "document_8.pdf"
## [37] "document_9.pdf"
# Let's remove the stopwords and some other unwanted artefacts
tokens_nostop <- tokens_select(tokens, pattern = stopwords('en'), selection = 'remove')
tokens_nostop <- tokens_select(tokens_nostop, pattern = c("n*", "percent", "et",
"al", "can", "2019", "2020", "b*",
"2018", "figure", "e.g."),
selection = 'remove')
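# The patterns above use quanteda's default "glob" matching, so "n*" and "b*" drop every
# token that starts with n or b. If you want to double-check what a pattern would remove
# before committing to it, the same call with selection = "keep" shows the matches
# (applied here to the pre-removal tokens object, purely as a check):
head(tokens_select(tokens, pattern = "n*", selection = "keep"), 2)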
# A preview of what our tokens look like within each document.
head(tokens_nostop, 5)
## Tokens consisting of 5 documents and 8 docvars.
## document_1.pdf :
## [1] "COVID-19" "rapid" "guideline" "suspected" "pneumonia" "community"
## [7] "guideline" "April" "guidance" "rights" "reserved" "Subject"
## [ ... and 1,254 more ]
##
## document_11.pdf :
## [1] "Morbidity" "Mortality" "Weekly" "Report" "Release"
## [6] "Vol" "March" "Severe" "Outcomes" "Among"
## [11] "Patients" "Coronavirus"
## [ ... and 1,410 more ]
##
## document_12.pdf :
## [1] "Coronavirus" "Disease" "COVID-19" "Professionals"
## [5] "Frequently" "Asked" "Questio" "Answers"
## [9] "March" "also" "general" "COVID-19"
## [ ... and 1,150 more ]
##
## document_14.pdf :
## [1] "Coronavirus" "Disease" "COVID-19" "Home" "Isolation"
## [6] "Persons" "Interim" "Guidance" "CDC" "guidance"
## [11] "COVID-19" "may"
## [ ... and 642 more ]
##
## document_16.pdf :
## [1] "Morbidity" "Mortality" "Weekly" "Report"
## [5] "Release" "Vol" "March" "Evaluation"
## [9] "Effectiveness" "Surveillance" "Containment" "Measures"
## [ ... and 1,505 more ]
# Let's create a new, cleaner document-feature matrix without the duplicate document versions. This time we build the matrix from the tokens instead of the corpus.
dfm_cov <- dfm(tokens_nostop)
head(dfm_cov, 5)
## Document-feature matrix of: 5 documents, 14,852 features (96.6% sparse) and 8 docvars.
## features
## docs covid-19 rapid guideline suspected pneumonia community april
## document_1.pdf 47 20 28 19 38 19 1
## document_11.pdf 41 0 0 1 1 4 0
## document_12.pdf 58 0 0 8 2 3 0
## document_14.pdf 14 0 0 2 2 0 0
## document_16.pdf 38 1 0 3 6 5 0
## features
## docs guidance rights reserved
## document_1.pdf 10 28 14
## document_11.pdf 4 0 0
## document_12.pdf 8 0 0
## document_14.pdf 14 0 0
## document_16.pdf 0 0 0
## [ reached max_nfeat ... 14,842 more features ]
# Let's count in how many documents each feature appears.
doc_freq <- docfreq(dfm_cov)
# We can limit our analysis to features that are mentioned in at least 10 documents
dfm_cov <- dfm_cov[, doc_freq >= 10]
# This is how many are left.
nfeat(dfm_cov)
## [1] 694
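# The same filter can be expressed in one step with dfm_trim(), which accepts a minimum
# document frequency directly. Shown here on a fresh copy so the pipeline above stays
# untouched; the feature count should match the one printed above.
dfm_cov_alt <- dfm_trim(dfm(tokens_nostop), min_docfreq = 10)
nfeat(dfm_cov_alt)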
# And these are the top ones.
topfeatures(dfm_cov, 10)
## covid-19 health cases china patients care disease infection
## 1593 1163 704 699 695 596 547 528
## countries world
## 527 522
# We can also calculate tf-idf weights.
dfm_tfidf <- dfm_tfidf(dfm_cov, force = TRUE)
head(dfm_tfidf,5)
## Document-feature matrix of: 5 documents, 694 features (65.9% sparse) and 8 docvars.
## features
## docs covid-19 rapid suspected pneumonia community guidance
## document_1.pdf 2.9634321 4.515581 3.2349726 5.8226783 2.9113391 0.9108047
## document_11.pdf 2.5851216 0 0.1702617 0.1532284 0.6129135 0.3643219
## document_12.pdf 3.6570013 0 1.3620937 0.3064568 0.4596851 0.7286438
## document_14.pdf 0.8827244 0 0.3405234 0.3064568 0 1.2751266
## document_16.pdf 2.3959663 0.225779 0.5107851 0.9193703 0.7661419 0
## features
## docs rights reserved subject https
## document_1.pdf 11.81806 5.489547 3.7404042 3.4437540
## document_11.pdf 0 0 0.2671717 0.9839297
## document_12.pdf 0 0 0 0
## document_14.pdf 0 0 0.2671717 0
## document_16.pdf 0 0 0.2671717 0.2459824
## [ reached max_nfeat ... 684 more features ]
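# tf-idf is most informative at the document level: the highest-weighted terms are the
# ones that distinguish a document from the rest of the collection. A quick way to peek
# at them for a single document (document_1.pdf chosen purely as an example):
topfeatures(dfm_tfidf["document_1.pdf", ], 10)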
# And further limit the analysis to terms that are mentioned no less than 100 times
dfm_trim <- dfm_trim(dfm_cov, min_termfreq = 100)
# This is how many will remain in this trimmed version.
nfeat(dfm_trim)
## [1] 180
# Let's compare how UK and US policy documents differ. We can now subset the document tokens based on the metadata that we spent so much time linking to the text data.
tokens_UK <- tokens_subset(tokens_nostop, Country == "United Kingdom")
tokens_US <- tokens_subset(tokens_nostop, Country == "United States")
# We can do the same with the document-feature matrix and compare wordclouds for the UK and the US. Note, however, that the number of words passing a given frequency threshold is very different in the two cases.
dfm_subset(dfm_cov,
Country %in% c("United Kingdom", "United States")) %>%
dfm(groups = "Country") %>%
dfm_trim(min_termfreq = 50, verbose = FALSE) %>%
textplot_wordcloud(comparison = TRUE, max_words = 150, color = c("blue", "red"))
# Let's also create an individual dfm for each country. By setting country-specific frequency thresholds we can make the number of words in each sample roughly equal. We will use these matrices in a more detailed country-level comparison later.
dfm_UK <- dfm_subset(dfm_cov, Country == "United Kingdom")
dfm_trim_UK <- dfm_trim(dfm_UK, min_termfreq = 20)
nfeat(dfm_trim_UK)
## [1] 73
dfm_US <- dfm_subset(dfm_cov, Country == "United States")
dfm_trim_US <- dfm_trim(dfm_US, min_termfreq = 100)
nfeat(dfm_trim_US)
## [1] 68
# We can also create a feature co-occurrence matrix, which helps us identify which words are most likely to occur together.
fcm_cov <- fcm(dfm_cov)
dim(fcm_cov)
## [1] 694 694
top_feat <- names(topfeatures(fcm_cov, 40))
fcm_cov_select <- fcm_select(fcm_cov, pattern = top_feat)
# We now have a 40 x 40 matrix, which we can visualise as a co-occurrence network.
dim(fcm_cov_select)
## [1] 40 40
size <- log(colSums(dfm_select(dfm_cov, top_feat)))
set.seed(144)
textplot_network(fcm_cov_select, min_freq = 0.8, vertex_size = size / max(size) * 3)
# We can use various dictionaries to identify the key themes addressed by each document. One such dictionary (Laver-Garry) provides a classification of policy- and governance-related terms. You will need to download it from the internet and load it here in .cat format for this step to work.
dict_lg <- dictionary(file = "/Users/dmitrymalkov/Desktop/laver-garry.cat")
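# If you do not have the Laver-Garry file at hand, dictionary() can also be built inline
# from a named list of glob patterns. A toy sketch (categories and patterns invented here
# purely for illustration, not part of the Laver-Garry scheme):
dict_toy <- dictionary(list(
  economy = c("econom*", "tax*", "budget*"),
  health  = c("hospital*", "patient*", "vaccin*")
))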
# Let's see which themes are addressed in the UK and US documents.
dfm_lg_UK <- dfm_lookup(dfm_UK, dictionary = dict_lg, levels = 1)
dfm_lg_US <- dfm_lookup(dfm_US, dictionary = dict_lg, levels = 1)
print(dfm_lg_UK)
## Document-feature matrix of: 5 documents, 9 features (42.2% sparse) and 8 docvars.
## features
## docs CULTURE ECONOMY ENVIRONMENT GROUPS INSTITUTIONS LAW_AND_ORDER
## document_1.pdf 10 58 1 0 24 1
## document_2.pdf 7 93 1 0 65 0
## document_23.pdf 21 197 5 0 34 0
## document_33.pdf 14 99 3 0 11 0
## document_8.pdf 0 30 1 0 3 1
## features
## docs RURAL URBAN VALUES
## document_1.pdf 0 0 32
## document_2.pdf 0 0 39
## document_23.pdf 0 0 9
## document_33.pdf 0 0 7
## document_8.pdf 0 0 9
print(dfm_lg_US)
## Document-feature matrix of: 18 documents, 9 features (56.2% sparse) and 8 docvars.
## features
## docs CULTURE ECONOMY ENVIRONMENT GROUPS INSTITUTIONS LAW_AND_ORDER
## document_11.pdf 2 83 0 0 14 0
## document_12.pdf 3 102 5 0 22 1
## document_14.pdf 1 24 0 0 3 0
## document_16.pdf 0 73 1 0 12 1
## document_17.pdf 0 42 1 0 7 0
## document_24.pdf 4 10 0 0 2 0
## features
## docs RURAL URBAN VALUES
## document_11.pdf 0 0 9
## document_12.pdf 0 0 2
## document_14.pdf 0 0 0
## document_16.pdf 0 0 2
## document_17.pdf 0 0 0
## document_24.pdf 0 0 7
## [ reached max_ndoc ... 12 more documents ]
# We can try the same with different policy organisations.
dfm_lg <- dfm_lookup(dfm_cov, dictionary = dict_lg, levels = 1)
org_vec <- unique(data_frame$Outlet.or.Author)
dfm_lg_org <- dfm_subset(dfm_lg,
Outlet.or.Author %in% org_vec) %>%
dfm(groups = "Outlet.or.Author")
# The results can be visualised as a heatmap. We need to manipulate the data a little: order the rows and columns by hierarchical clustering, then melt the matrix into long format.
df_lg_org <- dfm_lg_org[, 2:ncol(dfm_lg_org)] %>%
  as.matrix %>%
  as.data.frame()
row.order <- hclust(dist(df_lg_org))$order
col.order <- hclust(dist(t(df_lg_org)))$order
df_lg_org_2 <- df_lg_org[row.order, col.order]
cluster_matrix <- melt(as.matrix(df_lg_org_2))
names(cluster_matrix) <- c("Organisation", "Theme","Frequency")
# Now we plot the heatmap.
ggplot(cluster_matrix,aes(x = Organisation,
y = Theme,
fill = Frequency,
label = Frequency)) +
geom_tile() + scale_fill_viridis_c() +
geom_text(color="#FFFFFF",size=2) +
theme_fivethirtyeight() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1)) +
theme(legend.position="none",
text = element_text(size=9)) +
labs(title = "Most frequently invoked themes")
# We can also define our own keywords instead of using a dictionary. For example, let's see how often our documents refer to scientific research. We start by looking at the overall frequency of the terms we are interested in.
dfm_study <- dfm(tokens_nostop, select = c("studi*", "study", "research*", "scienti*"))
tstat_freq <- textstat_frequency(dfm_study, n = 100, groups = "Country")
dfm_study %>%
textstat_frequency(n = 100) %>%
ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
geom_point() +
coord_flip() +
labs(x = NULL, y = "Frequency") +
theme_minimal()
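# tstat_freq, computed above with groups = "Country", can also be inspected directly as a
# table -- a quick complement to the plot, showing which research-related terms each
# country's documents lean on.
head(tstat_freq, 10)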
# Let's use our own keywords and repeat the steps to produce a heat map.
dfm_lg_org_2 <- dfm_subset(dfm_study,
Outlet.or.Author %in% org_vec) %>%
dfm(groups = "Outlet.or.Author")
df_lg_org_2 <- dfm_lg_org_2[, 2:ncol(dfm_lg_org_2)] %>%
  as.matrix %>%
  as.data.frame()
row.order <- hclust(dist(df_lg_org_2))$order
col.order <- hclust(dist(t(df_lg_org_2)))$order
df_lg_org_3 <- df_lg_org_2[row.order, col.order]
cluster_matrix <- melt(as.matrix(df_lg_org_3))
names(cluster_matrix) <- c("Organisation", "Word","Frequency")
ggplot(cluster_matrix,aes(x = Organisation,
y = Word,
fill = Frequency,
label = Frequency)) +
geom_tile() + scale_fill_viridis_c() +
geom_text(color="#FFFFFF",size=2) +
theme_fivethirtyeight() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1)) +
theme(legend.position="none",
text = element_text(size=9)) +
labs(title = "Frequency of research related terms by policy organisation")
# Lexical dispersion plots are useful for seeing how terms are distributed within documents. Let's see how this works with a few examples, using the textplot_xray function to create the plots.
kwic(tokens_nostop, pattern = "international") %>%
textplot_xray()
# Let's compare several terms at the same time and colour label them.
g <- textplot_xray(
kwic(tokens_nostop, pattern = "China"),
kwic(tokens_nostop, pattern = "international"),
kwic(tokens_nostop, pattern = "travel"))
g + aes(color = keyword) +
scale_color_manual(values = c("blue", "red", "dark green")) +
theme(legend.position = "none")
# Relative frequency plots enable us to compare the relative frequency of top terms across countries or any other document variable.
dfm_weight <- dfm_cov %>%
dfm_weight(scheme = "prop")
freq_weight_country <- textstat_frequency(dfm_weight, n = 10, groups = "Country")
ggplot(data = freq_weight_country, aes(x = nrow(freq_weight_country):1, y = frequency)) +
geom_point() +
facet_wrap(~ group, scales = "free") +
coord_flip() +
scale_x_continuous(breaks = nrow(freq_weight_country):1,
labels = freq_weight_country$feature) +
labs(x = NULL, y = "Relative frequency")
# We can do the same by policy organisations to give another example.
freq_weight_org <- textstat_frequency(dfm_weight, n = 10, groups = "Outlet.or.Author")
ggplot(data = freq_weight_org, aes(x = nrow(freq_weight_org):1, y = frequency)) +
geom_point() +
facet_wrap(~ group, scales = "free") +
coord_flip() +
scale_x_continuous(breaks = nrow(freq_weight_org):1,
labels = freq_weight_org$feature) +
labs(x = NULL, y = "Relative frequency")
# Or we can see how term usage changed over time by country, the UK and the US in this case.
dfm_weight_UK <- dfm_UK %>%
dfm_weight(scheme = "prop")
freq_weight_UK_date <- textstat_frequency(dfm_weight_UK, n = 10, groups = "Mention.Date")
ggplot(data = freq_weight_UK_date, aes(x = nrow(freq_weight_UK_date):1, y = frequency)) +
geom_point() +
facet_wrap(~ group, scales = "free") +
coord_flip() +
scale_x_continuous(breaks = nrow(freq_weight_UK_date):1,
labels = freq_weight_UK_date$feature) +
labs(x = NULL, y = "Relative frequency")
dfm_weight_US <- dfm_US %>%
dfm_weight(scheme = "prop")
freq_weight_US_date <- textstat_frequency(dfm_weight_US, n = 5, groups = "Mention.Date")
ggplot(data = freq_weight_US_date, aes(x = nrow(freq_weight_US_date):1, y = frequency)) +
geom_point() +
facet_wrap(~ group, scales = "free") +
coord_flip() +
scale_x_continuous(breaks = nrow(freq_weight_US_date):1,
labels = freq_weight_US_date$feature) +
labs(x = NULL, y = "Relative frequency")
# Keyness comparison is based on a statistic that shows which terms are used differentially in one group relative to another. We compare UK documents with US documents, using the UK documents as the target and the US documents as the reference.
dfm_UK_US <- dfm_subset(dfm_cov,
Country %in% c("United Kingdom", "United States"))
dfm_UK_US <- dfm(dfm_UK_US, groups = "Country")
result_keyness <- textstat_keyness(dfm_UK_US, target = "United Kingdom")
textplot_keyness(result_keyness)
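# The underlying keyness scores can also be read as a table: positive values mark terms
# over-represented in the UK documents (the target), negative values terms over-represented
# in the US documents (the reference).
head(result_keyness, 10)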
# We can use another dictionary, which identifies all terms related to countries, geographical regions and nationalities.
dict_newsmap <- dictionary(data_dictionary_newsmap_en)
# Let's create a subset of tokens consisting of these dictionary terms.
tokens_region <- tokens_lookup(tokens_nostop, dictionary = dict_newsmap, levels = 3)
# We can also see the context in which they occur (if we are interested in the context)
kw_region <- kwic(tokens_nostop, pattern = data_dictionary_newsmap_en, window = 3)
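# A quick peek at the keyword-in-context object (not used further below, shown only to
# illustrate what kwic() returns):
head(kw_region, 5)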
# But let's proceed. One approach is to use the textmodel_newsmap modelling algorithm to predict which country each document focuses on, based on the geographical dictionary.
dfm_label <- dfm(tokens_region, tolower = FALSE)
tmod_nm <- textmodel_newsmap(dfm_cov, y = dfm_label)
pred_nm <- predict(tmod_nm)
head(pred_nm, 20)
## document_1.pdf document_11.pdf document_12.pdf document_14.pdf document_16.pdf
## "GB" "CN" "CN" "CN" "CN"
## document_17.pdf document_2.pdf document_23.pdf document_24.pdf document_26.pdf
## "CN" "GB" "CN" "CN" "CN"
## document_27.pdf document_28.pdf document_3.pdf document_31.pdf document_32.pdf
## "US" "CN" "CN" "CN" "CN"
## document_33.pdf document_34.pdf document_35.pdf document_36.pdf document_37.pdf
## "CN" "CN" "CN" "NL" "CN"
# We can summarise and plot these results on a geographical map
count <- sort(table(factor(pred_nm, levels = colnames(dfm_label))), decreasing = TRUE)
head(count, 20)
##
## CN US GB ER RE SO IQ NL BI DJ ET KE KM MG MU MW MZ RW SC TZ
## 23 6 3 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
dat_country <- as.data.frame(count, stringsAsFactors = FALSE)
colnames(dat_country) <- c("id", "frequency")
world_map <- map_data(map = "world")
world_map$region <- iso.alpha(world_map$region)
ggplot(dat_country, aes(map_id = id)) +
geom_map(aes(fill = frequency), map = world_map) +
expand_limits(x = world_map$long, y = world_map$lat) +
scale_fill_continuous(name = "Frequency") +
theme_void() +
coord_fixed()
# Obviously, most documents focus on China, with the US a distant second. This is not very informative, so we change the approach and calculate the frequency of all geographical mentions across all documents instead.
dat_country2 <- as.character(tokens_region)
dat_country2<- as.factor(dat_country2)
dat_country2 <- as.data.frame(dat_country2)
dat_country2 <- count(dat_country2,dat_country2)
colnames(dat_country2) <- c("id", "frequency")
# And to make it even more precise we can zoom in on Europe and drop the UK, because it obviously has many mentions: we are looking at English-language documents, some of them produced by UK policy organisations.
# So we create a subset of European regions to use in the map.
vec_EU <- c('AT','BE' , 'HR', 'BG', 'CY', 'CZ', 'DK', 'EE', 'FI', 'FR', 'DE', 'GR', 'HU', 'IE','IT', 'LV', 'LT', 'LU', 'MT', 'NL', 'PL', 'PT', 'RO', 'SK', 'SI', 'ES', 'SE') #UK is excluded
EU <- subset(world_map, region %in% vec_EU)
EU_subset <- subset(dat_country2, id %in% vec_EU)
# And finally plot it. This looks somewhat more informative.
ggplot(EU_subset, aes(map_id = id)) +
geom_map(aes(fill = frequency), map = EU) +
expand_limits(x = EU$long, y = EU$lat) +
scale_fill_viridis_c(name = "Frequency") +
theme_void() +
coord_fixed()