0.1 Introductory Data Science Group Project

0.2 Text Analysis of Policy Footprint of Covid-19 Research

0.2.1 Packages

library(rvest)
library(plyr)
library(pdftools)
library(glue)
library(tidyverse)
library(tm)
library(textcat)
library(tidytext)
library(stringr)
library(dplyr)
library(purrr)
library(textmineR)
library(quanteda)
library(newsmap)
library(maps)
library(data.table)
library(ggthemes)

0.2.2 Setting the working directory and loading the data

rm(list=ls())

setwd("~/Desktop/R/Introductory Data Science/Group Project")

data_frame <- read.csv("Covid-19\ Policy\ Mentions\ -\ 2020-04-06.csv") %>%
  select(Outlet.or.Author, Mention.Title, Country, Mention.Date, 
         Mention.URL, Research.Output.Title, Publication.Date, Journal.Collection.Title,
         Affiliations..GRID., Altmetric.Attention.Score)

0.2.3 Extracting URLs of policy documents and downloading the PDFs

# We create a character vector of unique URLs from which we will download policy documents 
urls <- data_frame$Mention.URL %>%
  as.character %>%
  unique

# We define the naming pattern for the downloaded PDFs. Then we set the working directory to the download folder and download each PDF from the urls vector. 
pdf_names <- paste0("document_", seq(urls), ".pdf")
setwd("~/Desktop/R/Introductory Data Science/Group Project/PDF downloads")
mapply(download.file, urls, pdf_names)
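# Note that download.file stops at the first broken link. A more defensive variant (a sketch,
# not part of the original pipeline) wraps each download in tryCatch and sets mode = "wb" so
# that the PDFs are not corrupted on Windows; it could be run instead of the mapply call above.
safe_download <- function(url, destfile) {
  tryCatch(download.file(url, destfile, mode = "wb"),
           error = function(e) message("Skipping ", url, ": ", conditionMessage(e)))
}
# mapply(safe_download, urls, pdf_names)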

0.2.4 Creating a preliminary corpus from the PDFs to identify and remove non-English documents from the folder

# This is a preliminary step that we need in order to filter out policy documents that are not in English. To do that, we create a corpus from all PDFs located in our current working directory (PDF downloads). 
setwd("~/Desktop/R/Introductory Data Science/Group Project/PDF downloads")
file_vector <- list.files(path = getwd())
corpus <- Corpus(URISource(file_vector),
                 readerControl = list(reader = readPDF))
# After we create the corpus, we use the textcat function to identify each document's language and delete non-English documents from the working directory.
languages <- textcat(corpus)
names(languages)[languages != "english"] %>%
  unlink()
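# Before deleting anything, it is worth checking what textcat actually detected; short or
# scanned documents are sometimes mislabelled.
table(languages)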

0.2.5 Creating a corpus from policy documents in English

# Now we create a new corpus consisting of the remaining English documents in the working directory.
setwd("~/Desktop/R/Introductory Data Science/Group Project/PDF downloads")
file_vector <- list.files(path = getwd())
corpus <- Corpus(URISource(file_vector),
                 readerControl = list(reader = readPDF))

0.2.6 Converting the corpus back into a dataframe to match the documents with original document variables

# An additional transformation is needed to match our text data with the document variables from our initial data set (data_frame). 

# First, we match unique URLs from the original data set with how our documents are named in the PDF Downloads folder.
knitr::opts_knit$set(root.dir = "~/Desktop/R/Introductory Data Science/Group Project/PDF downloads" )

df_uniq_url <- data_frame[!duplicated(data_frame$Mention.URL), ] 
df_uniq_url$document <- paste0("document_", seq(1,nrow(df_uniq_url), by = 1), ".pdf")

# Now we remove those rows that correspond to non-English documents that we removed from the working directory a few steps earlier
df_uniq_url <- df_uniq_url[df_uniq_url$document %in% file_vector, ]
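# A quick sanity check (optional): both of these should return character(0), meaning every
# remaining PDF has a matching row and vice versa.
setdiff(file_vector, df_uniq_url$document)
setdiff(df_uniq_url$document, file_vector)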

# We convert the corpus into a data frame
data_policy <- data.frame(text=sapply(corpus, identity), 
                        stringsAsFactors = FALSE)

# And we remove the second row, which contains document metadata extracted by the Corpus function earlier. The problem with that metadata is that it is very incomplete, which is why we are going through all these transformations to link each text to its metadata from the original data set. 
data_policy <- data_policy[-2,] 

# We reshape the data into long format, so that each row corresponds to one document.
data_policy <- gather(data_policy , key = "document", value = "text")

# Remove the "text." prefix from each document name.
data_policy$document <- gsub("text.", "", data_policy$document, fixed = TRUE)

# And finally we merge the text data frame with our initial data frame.
merged_data <- merge(df_uniq_url, data_policy, by.x = "document", by.y = "document")
merged_data$text <- as.character(merged_data$text)
merged_data <- merged_data %>%
  select(document, Mention.Title, Mention.Date, text, Outlet.or.Author, Country, 
         Publication.Date, Journal.Collection.Title, Affiliations..GRID.,
         Altmetric.Attention.Score) 

# Remove some unwanted symbols, which will be harder to remove later.
merged_data$text <- gsub("c(", "", merged_data$text, fixed = TRUE)

# And finally we create a new corpus from the merged data frame, which combines both the text and the document variables.
corpus_policy <- corpus(merged_data, docid_field = "document", 
                        text_field = "text")

0.2.7 Identifying highly similar documents (versions of documents) to remove from further analysis

# There is one more cleaning step to perform. Some of the documents we have downloaded are updates or versions of the same document. They are very similar in content and would distort our subsequent analysis. Therefore we calculate pairwise similarity and remove near-duplicates from our corpus. 

# To calculate similarity we create a document feature matrix and use the textstat_simil function.
dfm_cov <- dfm(corpus_policy,
            tolower = TRUE,
            remove_punct = TRUE,
            remove_symbols = TRUE,
            remove_numbers = TRUE,
            dictionary = NULL)
similarity <- textstat_simil(dfm_cov)
similarity <- as.data.frame(similarity)

# We set our similarity threshold at 0.95, which works well for flagging versions that originate from the same underlying document. 
threshold <- 0.95

# Now we identify documents with similarity higher than 0.95 and save their names in a character vector.
todrop <- subset(similarity, select = document1, subset = correlation > threshold, drop = TRUE)
todrop <- as.vector(unique(todrop))
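# It is worth eyeballing which pairs actually exceed the threshold before dropping anything.
subset(similarity, correlation > threshold, select = c(document1, document2, correlation))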

0.2.8 Creating tokens and removing stopwords

# Can we finally do some text analysis please? Yes, now we can. We will mostly use the quanteda library in this example. First, we create tokens.  
tokens <- tokens(corpus_policy,
                 what = "word",
                 remove_punct = TRUE,
                 remove_symbols = TRUE, 
                 remove_numbers = TRUE,
                 include_docvars = TRUE)

# We remove tokens associated with highly similar documents that we identified previously.
tokens <- tokens[-unlist(lapply(todrop, function(x) grep(x, names(tokens))))]

# These are the remaining documents.
names(tokens)
##  [1] "document_1.pdf"  "document_11.pdf" "document_12.pdf" "document_14.pdf"
##  [5] "document_16.pdf" "document_17.pdf" "document_2.pdf"  "document_23.pdf"
##  [9] "document_24.pdf" "document_26.pdf" "document_27.pdf" "document_28.pdf"
## [13] "document_3.pdf"  "document_31.pdf" "document_32.pdf" "document_33.pdf"
## [17] "document_34.pdf" "document_35.pdf" "document_36.pdf" "document_37.pdf"
## [21] "document_38.pdf" "document_40.pdf" "document_44.pdf" "document_46.pdf"
## [25] "document_49.pdf" "document_5.pdf"  "document_50.pdf" "document_52.pdf"
## [29] "document_53.pdf" "document_54.pdf" "document_55.pdf" "document_57.pdf"
## [33] "document_58.pdf" "document_6.pdf"  "document_7.pdf"  "document_8.pdf" 
## [37] "document_9.pdf"
# Let's remove the stopwords and some other unwanted artefacts
tokens_nostop <- tokens_select(tokens, pattern = stopwords('en'), selection = 'remove')
tokens_nostop <- tokens_select(tokens_nostop, pattern = c("n*", "percent", "et", 
                                                          "al", "can", "2019", "2020", "b*", 
                                                          "2018", "figure", "e.g."),
                                                           selection = 'remove')
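# An alternative (or complement) to hand-picked patterns is to drop very short tokens, which
# are mostly parsing leftovers; tokens_select has a min_nchar argument for this. A sketch,
# stored under a new name so it does not affect the results below:
tokens_min3 <- tokens_select(tokens_nostop, min_nchar = 3)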

# A preview of what our tokens look like within each document. 
head(tokens_nostop, 5)
## Tokens consisting of 5 documents and 8 docvars.
## document_1.pdf :
##  [1] "COVID-19"  "rapid"     "guideline" "suspected" "pneumonia" "community"
##  [7] "guideline" "April"     "guidance"  "rights"    "reserved"  "Subject"  
## [ ... and 1,254 more ]
## 
## document_11.pdf :
##  [1] "Morbidity"   "Mortality"   "Weekly"      "Report"      "Release"    
##  [6] "Vol"         "March"       "Severe"      "Outcomes"    "Among"      
## [11] "Patients"    "Coronavirus"
## [ ... and 1,410 more ]
## 
## document_12.pdf :
##  [1] "Coronavirus"   "Disease"       "COVID-19"      "Professionals"
##  [5] "Frequently"    "Asked"         "Questio"       "Answers"      
##  [9] "March"         "also"          "general"       "COVID-19"     
## [ ... and 1,150 more ]
## 
## document_14.pdf :
##  [1] "Coronavirus" "Disease"     "COVID-19"    "Home"        "Isolation"  
##  [6] "Persons"     "Interim"     "Guidance"    "CDC"         "guidance"   
## [11] "COVID-19"    "may"        
## [ ... and 642 more ]
## 
## document_16.pdf :
##  [1] "Morbidity"     "Mortality"     "Weekly"        "Report"       
##  [5] "Release"       "Vol"           "March"         "Evaluation"   
##  [9] "Effectiveness" "Surveillance"  "Containment"   "Measures"     
## [ ... and 1,505 more ]

0.2.9 Creating and analysing the document feature matrix

# Let's create a new cleaner document feature matrix without versions of the same documents. We will create the matrix from tokens instead of the corpus. 
dfm_cov <- dfm(tokens_nostop)
head(dfm_cov, 5)
## Document-feature matrix of: 5 documents, 14,852 features (96.6% sparse) and 8 docvars.
##                  features
## docs              covid-19 rapid guideline suspected pneumonia community april
##   document_1.pdf        47    20        28        19        38        19     1
##   document_11.pdf       41     0         0         1         1         4     0
##   document_12.pdf       58     0         0         8         2         3     0
##   document_14.pdf       14     0         0         2         2         0     0
##   document_16.pdf       38     1         0         3         6         5     0
##                  features
## docs              guidance rights reserved
##   document_1.pdf        10     28       14
##   document_11.pdf        4      0        0
##   document_12.pdf        8      0        0
##   document_14.pdf       14      0        0
##   document_16.pdf        0      0        0
## [ reached max_nfeat ... 14,842 more features ]
# Let's count how many documents mention our features 
doc_freq <- docfreq(dfm_cov)

# We can limit our analysis to features that are mentioned in at least 10 documents 
dfm_cov <- dfm_cov[, doc_freq >= 10]

# This is how many are left.
nfeat(dfm_cov)
## [1] 694
# And these are the top ones.
topfeatures(dfm_cov, 10) 
##  covid-19    health     cases     china  patients      care   disease infection 
##      1593      1163       704       699       695       596       547       528 
## countries     world 
##       527       522
# We can also calculate tf-idf weights.
dfm_tfidf <- dfm_tfidf(dfm_cov, force = TRUE)
head(dfm_tfidf,5)
## Document-feature matrix of: 5 documents, 694 features (65.9% sparse) and 8 docvars.
##                  features
## docs               covid-19    rapid suspected pneumonia community  guidance
##   document_1.pdf  2.9634321 4.515581 3.2349726 5.8226783 2.9113391 0.9108047
##   document_11.pdf 2.5851216 0        0.1702617 0.1532284 0.6129135 0.3643219
##   document_12.pdf 3.6570013 0        1.3620937 0.3064568 0.4596851 0.7286438
##   document_14.pdf 0.8827244 0        0.3405234 0.3064568 0         1.2751266
##   document_16.pdf 2.3959663 0.225779 0.5107851 0.9193703 0.7661419 0        
##                  features
## docs                rights reserved   subject     https
##   document_1.pdf  11.81806 5.489547 3.7404042 3.4437540
##   document_11.pdf  0       0        0.2671717 0.9839297
##   document_12.pdf  0       0        0         0        
##   document_14.pdf  0       0        0.2671717 0        
##   document_16.pdf  0       0        0.2671717 0.2459824
## [ reached max_nfeat ... 684 more features ]
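# With tf-idf weights we can also pull out the most characteristic terms of a single document,
# e.g. the first one (purely illustrative):
topfeatures(dfm_tfidf["document_1.pdf", ], 10)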
# And further limit the analysis to terms that are mentioned no less than 100 times
dfm_trim <- dfm_trim(dfm_cov, min_termfreq = 100)

# This is how many will remain in this trimmed version.
nfeat(dfm_trim)
## [1] 180

0.2.10 Preparing data for country level comparison

# Let's compare how UK and US policy documents differ. We can now create subsets of the tokens based on the metadata that we spent so much time linking to the text data. 
tokens_UK <- tokens_subset(tokens_nostop, Country == "United Kingdom")
tokens_US <- tokens_subset(tokens_nostop, Country == "United States")

# We can do the same with the document feature matrix and compare wordclouds for the UK and the US. Note, however, that with a common frequency threshold the number of retained words differs a lot between the two countries. 
dfm_subset(dfm_cov,
              Country %in% c("United Kingdom", "United States")) %>%
    dfm(groups = "Country") %>%
    dfm_trim(min_termfreq = 50, verbose = FALSE) %>%
    textplot_wordcloud(comparison = TRUE, max_words = 150, color = c("blue", "red"))

# Let's create an individual dfm for each country as well. By setting country-specific frequency thresholds we can make the number of words in each sample roughly equal. We will use these matrices in a more detailed country-level comparison later.
dfm_UK <- dfm_subset(dfm_cov, Country == "United Kingdom")
dfm_trim_UK <- dfm_trim(dfm_UK, min_termfreq = 20)
nfeat(dfm_trim_UK)
## [1] 73
dfm_US <- dfm_subset(dfm_cov, Country == "United States")
dfm_trim_US <- dfm_trim(dfm_US, min_termfreq = 100)
nfeat(dfm_trim_US)
## [1] 68
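# The two country subsets differ considerably in size, which is worth keeping in mind when
# comparing raw counts.
ndoc(dfm_UK); sum(ntoken(tokens_UK))
ndoc(dfm_US); sum(ntoken(tokens_US))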

0.2.11 Feature co-occurrence matrix

# We can also create a feature co-occurrence matrix, which will help us identify which words are more likely to occur together. 

fcm_cov <- fcm(dfm_cov)
dim(fcm_cov)
## [1] 694 694
top_feat <- names(topfeatures(fcm_cov, 40))
fcm_cov_select <- fcm_select(fcm_cov, pattern = top_feat)

# We have a 40 x 40 matrix, which we can now visualise as a co-occurrence network.
dim(fcm_cov_select)
## [1] 40 40
size <- log(colSums(dfm_select(dfm_cov, top_feat)))
set.seed(144)
textplot_network(fcm_cov_select, min_freq = 0.8, vertex_size = size / max(size) * 3)
## Registered S3 method overwritten by 'network':
##   method            from    
##   summary.character quanteda
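# Individual co-occurrence counts can also be inspected directly for a handful of features
# (these appear among the top features above).
fcm_select(fcm_cov, pattern = c("health", "care", "patients"))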

0.2.12 Dictionary based text analysis

# We can use various dictionaries to identify the key themes addressed by each document. One such dictionary (Laver-Garry) provides a classification of policy and governance related terms. You will need to download it from the internet and load it here in .cat format for this step to work. 
dict_lg <- dictionary(file = "/Users/dmitrymalkov/Desktop/laver-garry.cat")

# Let's see which themes are addressed in the UK and US documents.
dfm_lg_UK <- dfm_lookup(dfm_UK, dictionary = dict_lg, levels = 1)
dfm_lg_US <- dfm_lookup(dfm_US, dictionary = dict_lg, levels = 1)
print(dfm_lg_UK)
## Document-feature matrix of: 5 documents, 9 features (42.2% sparse) and 8 docvars.
##                  features
## docs              CULTURE ECONOMY ENVIRONMENT GROUPS INSTITUTIONS LAW_AND_ORDER
##   document_1.pdf       10      58           1      0           24             1
##   document_2.pdf        7      93           1      0           65             0
##   document_23.pdf      21     197           5      0           34             0
##   document_33.pdf      14      99           3      0           11             0
##   document_8.pdf        0      30           1      0            3             1
##                  features
## docs              RURAL URBAN VALUES
##   document_1.pdf      0     0     32
##   document_2.pdf      0     0     39
##   document_23.pdf     0     0      9
##   document_33.pdf     0     0      7
##   document_8.pdf      0     0      9
print(dfm_lg_US)
## Document-feature matrix of: 18 documents, 9 features (56.2% sparse) and 8 docvars.
##                  features
## docs              CULTURE ECONOMY ENVIRONMENT GROUPS INSTITUTIONS LAW_AND_ORDER
##   document_11.pdf       2      83           0      0           14             0
##   document_12.pdf       3     102           5      0           22             1
##   document_14.pdf       1      24           0      0            3             0
##   document_16.pdf       0      73           1      0           12             1
##   document_17.pdf       0      42           1      0            7             0
##   document_24.pdf       4      10           0      0            2             0
##                  features
## docs              RURAL URBAN VALUES
##   document_11.pdf     0     0      9
##   document_12.pdf     0     0      2
##   document_14.pdf     0     0      0
##   document_16.pdf     0     0      2
##   document_17.pdf     0     0      0
##   document_24.pdf     0     0      7
## [ reached max_ndoc ... 12 more documents ]
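# Because documents differ a lot in length, an arguably fairer comparison applies the lookup
# to a proportion-weighted dfm, so each theme is expressed as a share of a document's terms
# (a sketch):
dfm_lookup(dfm_weight(dfm_UK, scheme = "prop"), dictionary = dict_lg, levels = 1)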
# We can try the same with different policy organisations. 
dfm_lg <- dfm_lookup(dfm_cov, dictionary = dict_lg, levels = 1)
org_vec <- unique(data_frame$Outlet.or.Author)
dfm_lg_org <- dfm_subset(dfm_lg,
              Outlet.or.Author %in% org_vec) %>%
              dfm(groups = "Outlet.or.Author")

# The results can be visualised as a heatmap. We need to manipulate the data a little to convert it into a cluster matrix. 
df_lg_org <- dfm_lg_org[, 2:ncol(dfm_lg_org)] %>%
  as.matrix %>%
  as.data.frame()
row.order <- hclust(dist(df_lg_org))$order
col.order <- hclust(dist(t(df_lg_org)))$order
df_lg_org_2 <- df_lg_org[row.order, col.order]
cluster_matrix <- melt(as.matrix(df_lg_org_2))
names(cluster_matrix) <- c("Organisation", "Theme","Frequency")

# Now we plot the heatmap.
ggplot(cluster_matrix,aes(x = Organisation,
                          y = Theme,
                          fill = Frequency,
                          label = Frequency)) + 
  geom_tile() + scale_fill_viridis_c() + 
  geom_text(color="#FFFFFF",size=2) +
  theme_fivethirtyeight() +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1)) +
  theme(legend.position="none",
        text = element_text(size=9)) +
  labs(title = "Most frequently invoked themes")

# We can also create our own keywords instead of dictionaries. For example, let's see how often our documents refer to scientific research. Let's start by looking at the overall frequency of terms we are interested in.
    
dfm_study <- dfm(tokens_nostop, select = c("studi*", "study", "research*", "scienti*"))
tstat_freq <- textstat_frequency(dfm_study, n = 100, groups = "Country")

dfm_study %>% 
  textstat_frequency(n = 100) %>% 
  ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()
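# tstat_freq, which we created above but have not used yet, allows a grouped version of the
# same plot, faceted by country:
ggplot(tstat_freq, aes(x = reorder(feature, frequency), y = frequency)) +
  geom_col() +
  facet_wrap(~ group, scales = "free") +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()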

# Let's use our own keywords and repeat the steps to produce a heatmap. 

dfm_lg_org_2 <- dfm_subset(dfm_study,
              Outlet.or.Author %in% org_vec) %>%
              dfm(groups = "Outlet.or.Author")

df_lg_org_2 <- dfm_lg_org_2[, 2:ncol(dfm_lg_org_2)] %>%
  as.matrix %>%
  as.data.frame()

row.order <- hclust(dist(df_lg_org_2))$order
col.order <- hclust(dist(t(df_lg_org_2)))$order
df_lg_org_3 <- df_lg_org_2[row.order, col.order]
cluster_matrix <- melt(as.matrix(df_lg_org_3))
names(cluster_matrix) <- c("Organisation", "Word","Frequency")

ggplot(cluster_matrix,aes(x = Organisation,
                          y = Word,
                          fill = Frequency,
                          label = Frequency)) + 
  geom_tile() + scale_fill_viridis_c() + 
  geom_text(color="#FFFFFF",size=2) +
  theme_fivethirtyeight() +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1)) +
  theme(legend.position="none",
        text = element_text(size=9)) +
  labs(title = "Frequency of research related terms by policy organisation")

0.2.13 Lexical dispersion plot

# Lexical dispersion plots are useful to see the distribution of terms within the documents. Let's see how it works with a few examples. We use the textplot_xray function to create these plots.

kwic(tokens_nostop, pattern = "international") %>%
  textplot_xray()

# Let's compare several terms at the same time and colour label them.

g <- textplot_xray(
    kwic(tokens_nostop, pattern = "China"),
     kwic(tokens_nostop, pattern = "international"),
     kwic(tokens_nostop, pattern = "travel"))
g + aes(color = keyword) +
    scale_color_manual(values = c("blue", "red", "dark green")) +
    theme(legend.position = "none")

0.2.14 Relative frequency plots by country of document

# Relative frequency plots enable us to compare the relative frequency of top terms across countries or any other document variable. 

dfm_weight <- dfm_cov %>%
     dfm_weight(scheme = "prop")
freq_weight_country <- textstat_frequency(dfm_weight, n = 10, groups = "Country")

ggplot(data = freq_weight_country, aes(x = nrow(freq_weight_country):1, y = frequency)) +
     geom_point() +
     facet_wrap(~ group, scales = "free") +
     coord_flip() +
     scale_x_continuous(breaks = nrow(freq_weight_country):1,
                        labels = freq_weight_country$feature) +
     labs(x = NULL, y = "Relative frequency")

# We can do the same by policy organisations to give another example.

freq_weight_org <- textstat_frequency(dfm_weight, n = 10, groups = "Outlet.or.Author")

ggplot(data = freq_weight_org, aes(x = nrow(freq_weight_org):1, y = frequency)) +
     geom_point() +
     facet_wrap(~ group, scales = "free") +
     coord_flip() +
     scale_x_continuous(breaks = nrow(freq_weight_org):1,
                        labels = freq_weight_org$feature) +
     labs(x = NULL, y = "Relative frequency")

# Or we can see how term usage changed over time within each country, the UK and the US in this case. 

dfm_weight_UK <- dfm_UK %>%
  dfm_weight(scheme = "prop")
freq_weight_UK_date <- textstat_frequency(dfm_weight_UK, n = 10, groups = "Mention.Date")

ggplot(data = freq_weight_UK_date, aes(x = nrow(freq_weight_UK_date):1, y = frequency)) +
     geom_point() +
     facet_wrap(~ group, scales = "free") +
     coord_flip() +
     scale_x_continuous(breaks = nrow(freq_weight_UK_date):1,
                        labels = freq_weight_UK_date$feature) +
     labs(x = NULL, y = "Relative frequency")

dfm_weight_US <- dfm_US %>%
  dfm_weight(scheme = "prop")
freq_weight_US_date <- textstat_frequency(dfm_weight_US, n = 5, groups = "Mention.Date")

ggplot(data = freq_weight_US_date, aes(x = nrow(freq_weight_US_date):1, y = frequency)) +
     geom_point() +
     facet_wrap(~ group, scales = "free") +
     coord_flip() +
     scale_x_continuous(breaks = nrow(freq_weight_US_date):1,
                        labels = freq_weight_US_date$feature) +
     labs(x = NULL, y = "Relative frequency")

0.2.15 Keyness comparison

# Keyness is a statistic of how differentially frequent terms are in a target group relative to a reference group. We can compare UK documents with US documents by using the UK as the target group (the US then serves as the reference). 

dfm_UK_US <- dfm_subset(dfm_cov,
                            Country %in% c("United Kingdom", "United States"))
dfm_UK_US <- dfm(dfm_UK_US, groups = "Country")
result_keyness <- textstat_keyness(dfm_UK_US, target = "United Kingdom")
textplot_keyness(result_keyness)
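# textstat_keyness defaults to the chi-squared measure; switching to the likelihood-ratio
# measure is a one-argument change and a quick robustness check on the ranking.
result_keyness_lr <- textstat_keyness(dfm_UK_US, target = "United Kingdom", measure = "lr")
head(result_keyness_lr, 10)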

0.2.16 Geographically mapping mentions of countries and nationalities in policy documents

# We can use another dictionary (the newsmap dictionary), which covers terms related to countries, geographical regions and nationalities. 
dict_newsmap <- dictionary(data_dictionary_newsmap_en)

# Let's create a subset of tokens consisting of these dictionary terms. 
tokens_region <- tokens_lookup(tokens_nostop, dictionary = dict_newsmap, levels = 3)

# We can also see the context in which they occur (if we are interested in the context)
kw_region <- kwic(tokens_nostop, pattern = data_dictionary_newsmap_en, window = 3)

# But let's proceed. One approach is to use the textmodel_newsmap semi-supervised classifier from the newsmap package to predict which country each document focuses on, based on the geographical dictionary. 
dfm_label <- dfm(tokens_region, tolower = FALSE)
tmod_nm <- textmodel_newsmap(dfm_cov, y = dfm_label)
pred_nm <- predict(tmod_nm)
head(pred_nm, 20)
##  document_1.pdf document_11.pdf document_12.pdf document_14.pdf document_16.pdf 
##            "GB"            "CN"            "CN"            "CN"            "CN" 
## document_17.pdf  document_2.pdf document_23.pdf document_24.pdf document_26.pdf 
##            "CN"            "GB"            "CN"            "CN"            "CN" 
## document_27.pdf document_28.pdf  document_3.pdf document_31.pdf document_32.pdf 
##            "US"            "CN"            "CN"            "CN"            "CN" 
## document_33.pdf document_34.pdf document_35.pdf document_36.pdf document_37.pdf 
##            "CN"            "CN"            "CN"            "NL"            "CN"
# We can summarise and plot these results on a geographical map
count <- sort(table(factor(pred_nm, levels = colnames(dfm_label))), decreasing = TRUE)
head(count, 20)
## 
## CN US GB ER RE SO IQ NL BI DJ ET KE KM MG MU MW MZ RW SC TZ 
## 23  6  3  1  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0
dat_country <- as.data.frame(count, stringsAsFactors = FALSE)
colnames(dat_country) <- c("id", "frequency")
world_map <- map_data(map = "world")
world_map$region <- iso.alpha(world_map$region) 

ggplot(dat_country, aes(map_id = id)) +
  geom_map(aes(fill = frequency), map = world_map) +
  expand_limits(x = world_map$long, y = world_map$lat) +
  scale_fill_continuous(name = "Frequency") +
  theme_void() +
  coord_fixed()

# Unsurprisingly, most documents focus on China, with the US a distant second. This is not very informative. We can change the approach and calculate the frequency of all geographical mentions across all documents instead. 

dat_country2 <- as.character(tokens_region)
dat_country2<- as.factor(dat_country2)
dat_country2 <- as.data.frame(dat_country2)
dat_country2 <- count(dat_country2,dat_country2)
colnames(dat_country2) <- c("id", "frequency")

# And to make it even more informative we can zoom in on Europe and exclude the UK, because it inevitably receives many mentions given that we are looking at English-language documents, some of them by UK policy organisations.

# So we create a subset of European countries to use in the map.
vec_EU  <-  c('AT','BE' , 'HR', 'BG', 'CY', 'CZ', 'DK', 'EE', 'FI', 'FR', 'DE', 'GR', 'HU', 'IE','IT', 'LV', 'LT', 'LU', 'MT', 'NL', 'PL', 'PT', 'RO', 'SK', 'SI', 'ES', 'SE') #UK is excluded 

EU <- subset(world_map, region %in% vec_EU)
EU_subset <- subset(dat_country2, id %in% vec_EU)

# And finally plot it. This looks somewhat more informative.
ggplot(EU_subset, aes(map_id = id)) +
  geom_map(aes(fill = frequency), map = EU) +
  expand_limits(x = EU$long, y = EU$lat) +
  scale_fill_viridis_c(name = "Frequency") +
  theme_void() +
  coord_fixed()