# Clear the global environment so the script starts from a blank workspace.
rm(list = ls())

# Attach every package the analysis uses in a single pacman call.
pacman::p_load(
  DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid,
  ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, topicmodels,
  ldatuning, lda, SnowballC, pals, flextable, RColorBrewer, hrbrthemes,
  janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups,
  factoextra, cluster, fastDummies, simputation, sentimentr, politeness,
  textir, xtable, plotrix, ggplot2, openxlsx
)

# Fixed seed so any randomised step is reproducible between runs.
set.seed(94305)

# Output folders, relative to the working directory. Warnings are silenced so
# that re-running the script when the folders already exist stays quiet.
dir.create("tables", showWarnings = FALSE)
dir.create("figures", showWarnings = FALSE)

# Data ----

# Load the two coded keyword files (v1 and v2).
#
# stringsAsFactors = FALSE is spelled out because the default differs between
# R versions (TRUE before R 4.0.0). The Keywords column is later passed to
# strsplit(), which only accepts character input, so a factor column would
# make the script fail.
#
# Columns are renamed purely by position, so both files are expected to hold
# these five fields in this order.
keyword_file_columns <- c(
  "ID", "Country", "Question", "Keywords", "PossibleQuestions"
)

df <- read.csv("./data/any_one_question.csv", stringsAsFactors = FALSE)
colnames(df) <- keyword_file_columns

df_v2 <- read.csv("./data/Any_One_Question_v2.csv", stringsAsFactors = FALSE)
colnames(df_v2) <- keyword_file_columns

# V1 Keywords ----

## Cleaning ----

# Split every comma-separated Keywords entry into individual keyword tokens and
# collapse the raw tokens into a fixed set of reporting categories.
#
# Produces:
#   keywords                  - raw tokens, trimmed
#   unique_keywords           - distinct raw tokens
#   keywords_processed        - one category label per raw token
#   unique_processed_keywords - distinct category labels
#
# The recoding rules below run strictly in sequence and each one sees the
# output of the previous ones, so their ORDER MATTERS (e.g. "no booster" must
# be moved to "availability" before the generic "booster" pattern runs).
# %in% rules are exact, case-sensitive matches; grepl() rules are
# case-insensitive substring/regex matches.

# as.character(): strsplit() rejects non-character input (e.g. a factor column).
keywords <- unlist(strsplit(as.character(df$Keywords), ","))
# Trim leading / trailing white space (trimws defaults: both sides, [ \t\r\n]).
keywords <- trimws(keywords)
unique_keywords <- unique(keywords)

keywords_processed <- keywords

# --- Manufacturers, necessity, reinfection ---
keywords_processed[keywords_processed %in% c("Johnson & Johnson", "Jansen", "astra zeneca", "institutions")] <- "vaccine manufacturers"
keywords_processed[keywords_processed %in% c("vaccine necessity")] <- "necessity"
# NOTE(review): the misspelt "COVID ressurgence" lands here, while the correctly
# spelt "COVID resurgence" is sent to "general questions on COVID" further
# down - confirm that this split is intended.
keywords_processed[keywords_processed %in% c("contracting COVID", "COVID future", "COVID ressurgence")] <- "COVID again after vax"

# --- Safety and side effects ---
keywords_processed[grepl("danger|dangr|safe|risk", keywords_processed, ignore.case = TRUE)] <- "safety & danger & risk"
keywords_processed[grepl("side effect", keywords_processed, ignore.case = TRUE)] <- "side effects"
keywords_processed[grepl("symptom", keywords_processed, ignore.case = TRUE)] <- "side effects"
keywords_processed[keywords_processed %in% c("long term", "sick", "post vaccine behaviour", "headache", "body")] <- "side effects"
keywords_processed[grepl("blood|paralysis|blindness", keywords_processed, ignore.case = TRUE)] <- "serious side effects"
# NOTE(review): "eradication" / "erradication" are recoded to "fertility";
# the mapping looks surprising - confirm against the coding scheme.
keywords_processed[keywords_processed %in% c("eradication", "erradication")] <- "fertility"

# --- How the vaccine works, what it offers, how to get it ---
keywords_processed[grepl("mechanism|mechanissm", keywords_processed, ignore.case = TRUE)] <- "mechanism"
keywords_processed[grepl("benefit|assurance|incentives", keywords_processed, ignore.case = TRUE)] <- "benefits"
keywords_processed[grepl("logistic", keywords_processed, ignore.case = TRUE)] <- "logistics"
keywords_processed[grepl("eficacy|efficacy", keywords_processed, ignore.case = TRUE)] <- "efficacy"
keywords_processed[grepl("death", keywords_processed, ignore.case = TRUE)] <- "death"
keywords_processed[grepl("conspiracy", keywords_processed, ignore.case = TRUE)] <- "conspiracy"
keywords_processed[grepl("testing|develoment|development|science question", keywords_processed, ignore.case = TRUE)] <- "testing & development"
keywords_processed[grepl("regulation|mandates|compulsory|require", keywords_processed, ignore.case = TRUE)] <- "requirements & regulations"

# "no booster" has to be reassigned before the generic booster pattern below,
# otherwise it would be swallowed into "booster".
keywords_processed[keywords_processed %in% c("no booster", "disparity")] <- "availability"
keywords_processed[grepl("booster|bosster", keywords_processed, ignore.case = TRUE)] <- "booster"

# --- General questions ---
keywords_processed[keywords_processed %in% c(
  "COVID status", "COVID origin", "COVID urgency", "COVID end", "COVID cure",
  "COVID general", "variantss", "covid end", "variants", "COVID origins",
  "COVID resurgence", "cure", "epidemiology", "empidemology", "epidemology",
  "transmission"
)] <- "general questions on COVID"
keywords_processed[keywords_processed %in% c(
  "recommendation", "recommendations", "best vaccine", "number of dosess",
  "vaccine doses", "vaccine composition", "vaccine general", "number of doses",
  "vaccine components", "COVID protection", "COVID age", "age", "immune",
  "immunity", "vaccination age", "vulnerable groups", "vaccine age",
  "instructions", "vaccine longevity"
)] <- "general questions on vaccine"

# --- Remaining topical categories ---
keywords_processed[grepl("chronic", keywords_processed, ignore.case = TRUE)] <- "chronic diseases"
keywords_processed[grepl("nonsensical", keywords_processed, ignore.case = TRUE)] <- "nonsensicals"
keywords_processed[keywords_processed %in% c("distrust", "surveillance", "vaccine failures", "vaccine contradictions", "vaccine failure")] <- "distrust"
# The pattern used to be written glob-style as "pregnan*". As a regex the "*"
# only makes the final "n" optional, i.e. it matches any label containing
# "pregna"; that exact match set is now spelled out literally.
keywords_processed[grepl("pregna", keywords_processed, ignore.case = TRUE)] <- "pregnancy"
keywords_processed[grepl("children", keywords_processed, ignore.case = TRUE)] <- "children"

# --- No real question asked ---
keywords_processed[grepl("not sure|don't know", keywords_processed, ignore.case = TRUE)] <- "none"
keywords_processed[keywords_processed == ""] <- "none"

# NOTE(review): "med" is a bare substring, so besides "medicine"/"medication"
# it also matches any remaining label that merely contains those letters
# (e.g. "media", "immediate") - confirm this is acceptable.
keywords_processed[grepl("alternative|med", keywords_processed, ignore.case = TRUE)] <- "medicine & alternatives"

# Everything that is not one of the final categories becomes "others". Raw
# tokens that already equal a category name (e.g. "access", "cost", "pain")
# have no rule above and simply pass through unchanged.
known_categories <- c(
  "access", "vaccine manufacturers", "necessity", "cost",
  "COVID again after vax", "safety & danger & risk", "side effects",
  "serious side effects", "fertility", "mechanism", "benefits", "logistics",
  "efficacy", "death", "conspiracy", "testing & development",
  "requirements & regulations", "availability", "booster",
  "general questions on COVID", "general questions on vaccine",
  "chronic diseases", "nonsensicals", "distrust", "pregnancy", "children",
  "none", "medicine & alternatives", "pain"
)
keywords_processed[!keywords_processed %in% known_categories] <- "others"
unique_processed_keywords <- unique(keywords_processed)

## Keywords Statistics ----

# Frequency of each processed keyword category across all V1 keyword tokens.
tab <- table(keywords_processed)

# Raw count: categories ordered from most to least frequent, shown as an
# interactive table. The count column header carries the number of rows in df.
raw_count <- as.data.frame(sort(tab, decreasing = TRUE))
names(raw_count)[2] <- sprintf("Freq (n = %d)", nrow(df))
datatable(raw_count)

# Percentage of Participants (currently disabled)
# percentage_count <- as.data.frame(round(tab[order(tab,decreasing = TRUE)] / nrow(df) * 100, 2))
# percentage_count$Freq <- paste0(percentage_count$Freq, "%")
# colnames(percentage_count)[2] <- paste0("Percentage (n = ", nrow(df), ")")
# percentage_count

# V2 Keywords ----

## Keywords Statistics ----

# Tabulate the V2 keywords as coded: tokens are split and trimmed but, unlike
# V1, not recoded into broader categories.
#
# as.character(): strsplit() rejects non-character input (e.g. a factor column).
keywords_v2 <- unlist(strsplit(as.character(df_v2$Keywords), ","))
# Trim leading / trailing white space (trimws defaults: both sides, [ \t\r\n]).
keywords_v2 <- trimws(keywords_v2)
unique_keywords_v2 <- unique(keywords_v2)

tab_v2 <- table(keywords_v2)

# Raw count: keywords ordered from most to least frequent, shown as an
# interactive table. The count column header carries the number of rows in
# df_v2.
raw_count_v2 <- as.data.frame(sort(tab_v2, decreasing = TRUE))
colnames(raw_count_v2)[2] <- paste0("Freq (n = ", nrow(df_v2), ")")
datatable(raw_count_v2)