pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer)
set.seed(94305)
dir.create(file.path('tables'), showWarnings = FALSE)
dir.create(file.path('figures'), showWarnings = FALSE)# files <- list.files('./data',
# pattern = '.*MIT.*.csv',
# full.names = TRUE)
# (INPUT_FILENAME <- files[which.max(file.info(files)$mtime)])
df <- read.csv("data/latest/Vaccine Hesitancy Pilot - MIT SIDE - Copy_October 26, 2021_13.38.csv")
#df %>% select(contains("start"))# filter out testing data
df_cv <- df %>% filter(StartDate >= "2021-10-25 17:00:00")
# filter out incomplete data
df_cv <- df_cv %>% filter(Progress == "100")
df_cv <- df_cv %>% filter(legit_threat != "")
# remove unnecessary information columns
df_cv <- df_cv %>%
subset(
select = -c(StartDate, EndDate, Status, IPAddress, Progress, ResponseId, RecordedDate, Finished, RecipientLastName, RecipientFirstName, RecipientEmail, ExternalReference, DistributionChannel, UserLanguage)
)
# remove unmeaningful responses
df_cv <- df_cv %>%
subset(
select = -c(consent10, consent_participate, Get.started)
)
df_cv$age <- as.numeric(df_cv$age)
df_cv[["age.1"]] <- as.numeric(df_cv$age.1)
df_cv[["Duration..in.seconds."]] <- as.numeric(df_cv[["Duration..in.seconds."]])mc_variables <- c("legit_threat", "ask_vax_stat", "vax_mandate_op", "friend_vax", "vax_status", "covid_pos", "gender", "race", "lib_cons_1", "vote_2020", "age","income","educ","trust_gov_1", "trust_healthcare_1")
df_mc <- df_cv[, mc_variables]
makeCodebook(df_cv[, c(mc_variables, "express_true_op", "Duration..in.seconds.")], replace = TRUE,
reportTitle = 'Pilot 1 Variables Code Book',
file = 'data/codebook_df_cv_pilot1.Rmd')is_all_numeric <- function(x) {
!any(is.na(suppressWarnings(as.numeric(na.omit(x))))) & is.character(x)
}df_mc_numeric <-
df_mc %>%
mutate(
legit_threat = case_when(
legit_threat == "absolutely" ~ 3,
legit_threat == "for some ppl, sure" ~ 2,
legit_threat == "vaccine is a bigger threat" ~ 1,
),
ask_vax_stat = case_when(
ask_vax_stat == "no problem at all" ~ 3,
ask_vax_stat == "only with good reason" ~ 2,
ask_vax_stat == "none of their biz" ~ 1,
),
vax_mandate_op = case_when(
vax_mandate_op == "yes, WAY overdue!" ~ 3,
vax_mandate_op == "maybe..." ~ 2,
vax_mandate_op == "absolutely NOT!" ~ 1,
),
friend_vax = case_when(
friend_vax == "GET IT!" ~ 3,
friend_vax == "It depends..." ~ 2,
friend_vax == "DON'T do it!" ~ 1,
),
vax_status = case_when(
vax_status == "yes, i have" ~ 3,
vax_status == "i'm not telling you!" ~ 2,
vax_status == "no, i haven't" ~ 1,
),
covid_pos = case_when(
covid_pos == "yes, i have" ~ 3,
covid_pos == "i'm not telling you!" ~ 2,
covid_pos == "no, i haven't" ~ 1,
),
female = case_when(
gender == "a woman" ~ 1,
gender != "a woman" ~ 0
),
white = case_when(
race == "White or Caucasian" ~ 1,
race != "White or Caucasian" ~ 0
),
vote_trump = case_when(
vote_2020 == "Trump" ~ 1,
vote_2020 != "Trump" ~ 0,
),
vote_biden = case_when(
vote_2020 == "Biden" ~ 1,
vote_2020 != "Biden" ~ 0,
),
) %>%
mutate_if(is_all_numeric, as.numeric) %>%
select(where(is.numeric), -lib_cons_1)
ggcorrplot(cor(df_mc_numeric, use = "pairwise.complete.obs"), type = "lower", lab = TRUE)We should use this section to develop code to understand what combinations of the first four multiple choice questions can help us correctly identify the vaccination status. This would help us fork the chatbot to the right people.
Here is an estimation of the vaccinate status based on a decision tree.
Estimation using a decision tree
fit <- rpart(vax_status~ legit_threat + ask_vax_stat + vax_mandate_op + friend_vax, data = df_mc, minsplit = 1)
rpart.plot(fit)Below are two-way cross tabulation tables based on individual variables with the vaccination status.
table(df_mc$vax_status, df_mc$legit_threat)##
## absolutely for some ppl, sure vaccine is a bigger threat
## i'm not telling you! 1 2 0
## no, i haven't 6 5 5
## yes, i have 18 8 5
table(df_mc$vax_status, df_mc$ask_vax_stat)##
## no problem at all none of their biz
## i'm not telling you! 0 3
## no, i haven't 4 9
## yes, i have 9 7
##
## only with good reason
## i'm not telling you! 0
## no, i haven't 3
## yes, i have 15
table(df_mc$vax_status, df_mc$vax_mandate_op)##
## absolutely NOT! maybe... yes, WAY overdue!
## i'm not telling you! 3 0 0
## no, i haven't 12 1 3
## yes, i have 8 10 13
table(df_mc$vax_status, df_mc$friend_vax)##
## DON'T do it! GET IT! It depends...
## i'm not telling you! 1 0 2
## no, i haven't 4 3 9
## yes, i have 1 24 6
table(df_mc$vax_status, df_mc$covid_pos)##
## i'm not telling you! no, i haven't yes, i have
## i'm not telling you! 3 0 0
## no, i haven't 0 12 4
## yes, i have 1 21 9
table(df_mc$vax_status, df_mc$gender)##
## a man a woman
## i'm not telling you! 1 2
## no, i haven't 8 8
## yes, i have 16 15
table(df_mc$vax_status, df_mc$race)##
## Asian or Pacific Islander Black or African American
## i'm not telling you! 0 1
## no, i haven't 1 2
## yes, i have 4 3
##
## Hispanic or Latino White or Caucasian
## i'm not telling you! 0 2
## no, i haven't 2 11
## yes, i have 4 20
table(df_mc$vax_status, df_mc$lib_cons_1)##
## 1 2 3 4 5
## i'm not telling you! 1 0 1 0 1
## no, i haven't 2 1 4 2 7
## yes, i have 6 2 15 4 4
table(df_mc$vax_status, df_mc$vote_2020)##
## Biden Did not vote Other Trump
## i'm not telling you! 0 1 0 2
## no, i haven't 3 6 0 7
## yes, i have 21 3 1 6
table(df_mc$vax_status, df_mc$income)##
## 1. <$15,000 2. $15,000-$24,999 3. $25,000-$49,999
## i'm not telling you! 0 0 0
## no, i haven't 2 4 5
## yes, i have 4 2 8
##
## 4. $50,000-$74,999 5. $75,000-$99,999 6. >$100,000
## i'm not telling you! 1 2 0
## no, i haven't 1 2 2
## yes, i have 3 7 7
table(df_mc$vax_status, df_mc$educ)##
## < high school 2-year degree 4-year degree
## i'm not telling you! 0 0 1
## no, i haven't 1 3 2
## yes, i have 2 3 8
##
## graduate degree high school diploma/GED some college
## i'm not telling you! 0 1 1
## no, i haven't 1 7 2
## yes, i have 6 5 7
table(df_mc$vax_status, df_mc$trust_gov_1)##
## 1 2 3 4 5
## i'm not telling you! 1 1 1 0 0
## no, i haven't 7 5 3 1 0
## yes, i have 7 6 6 6 6
table(df_mc$vax_status, df_mc$trust_healthcare_1)##
## 1 2 3 4 5
## i'm not telling you! 0 0 2 0 1
## no, i haven't 2 2 9 2 1
## yes, i have 1 5 9 8 8
Given the number of three-way cross tabulations being generated, all
of the outputs are stored in tables folder as PDFs instead
of being displayed here. The meaningful ones are also displayed in the
analysis memo in the same folder as this code.
df_mc$weight <- rep(1, nrow(df_mc))
df_mc$age_binary <- ifelse(df_mc$age >= median(df_mc$age), "old", "young")
mc_no_vax <- mc_variables[!(mc_variables %in% c("vax_status", "age"))]
for (i in 1:(length(mc_no_vax) - 1)){
for (j in (i+1):length(mc_no_vax)){
x <- mc_no_vax[i]
z <- mc_no_vax[j]
df_mc %>%
crosstab_3way(x = !!sym(x), y = vax_status, z = !!sym(z),
weight = weight) %>%
kable(digits = 0,
caption = paste0("Three Way Tab - vax_status + ", x, " + " , z),
format = "html") %>%
kable_styling() %>%
save_kable(paste0("tables/v1/three_way_", x, "_", z, ".pdf"))
}
}mc_pre_text <- c("legit_threat", "ask_vax_stat", "vax_mandate_op", "friend_vax")
for (i in 1:(length(mc_pre_text) - 1)){
for (j in (i+1):length(mc_pre_text)){
x <- mc_pre_text[i]
z <- mc_pre_text[j]
df_mc %>%
crosstab_3way(x = !!sym(x), y = vax_status, z = !!sym(z),
weight = weight) %>%
kable(digits = 0,
caption = paste0("Three Way Tab - vax_status + ", x, " + " , z),
format = "html") %>%
kable_styling() %>%
save_kable(paste0("tables/v1/three_way_", x, "_", z, ".png"))
}
}# define free text vars here
free_text_vars <- c("vax_worries", "vax_worries_add", "reaction_why", "reaction_why_add", "express_true_op", "express_true_op_add")Overall summary:
for (var in free_text_vars) {
df_cv[, str_glue("{var}_nchar")] <- nchar(df_cv[, {var}])
}
df_cv %>% select(contains("nchar")) %>% summary()## vax_worries_nchar vax_worries_add_nchar reaction_why_nchar
## Min. : 2.00 Min. : 2.00 Min. : 2.00
## 1st Qu.: 11.75 1st Qu.: 2.25 1st Qu.: 10.25
## Median : 19.50 Median : 8.00 Median : 25.50
## Mean : 39.60 Mean :14.58 Mean : 32.72
## 3rd Qu.: 41.25 3rd Qu.:16.00 3rd Qu.: 43.50
## Max. :264.00 Max. :83.00 Max. :185.00
## reaction_why_add_nchar express_true_op_nchar express_true_op_add_nchar
## Min. : 2.00 Min. :12.00 Min. : 2.00
## 1st Qu.: 8.25 1st Qu.:15.00 1st Qu.: 13.00
## Median : 15.50 Median :15.00 Median : 21.50
## Mean : 26.72 Mean :14.88 Mean : 37.20
## 3rd Qu.: 34.00 3rd Qu.:15.00 3rd Qu.: 41.25
## Max. :126.00 Max. :15.00 Max. :235.00
Median number of characters for free text responses grouped by vaccination status:
df_cv %>% group_by(vax_status) %>% summarize_at(vars(contains("nchar")), ~ median(.)) %>% knitr::kable()| vax_status | vax_worries_nchar | vax_worries_add_nchar | reaction_why_nchar | reaction_why_add_nchar | express_true_op_nchar | express_true_op_add_nchar |
|---|---|---|---|---|---|---|
| i’m not telling you! | 23 | 4 | 27 | 50 | 15 | 67.0 |
| no, i haven’t | 22 | 7 | 38 | 15 | 15 | 24.5 |
| yes, i have | 18 | 9 | 18 | 13 | 15 | 19.0 |
Vaccine worries - Unvaccinated:
# Create corpus
docs <- Corpus(VectorSource(df_cv %>% filter(vax_status != "yes, i have") %>% pull(vax_worries)))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))Vaccine worries - Vaccinated:
# Create corpus
docs <- Corpus(VectorSource(df_cv %>% filter(vax_status == "yes, i have") %>% pull(vax_worries)))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))Reaction why - Unvaccinated:
# Create corpus
docs <- Corpus(VectorSource(df_cv %>% filter(vax_status != "yes, i have") %>% pull(reaction_why)))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))Reaction why - Vaccinated:
# Create corpus
docs <- Corpus(VectorSource(df_cv %>% filter(vax_status == "yes, i have") %>% pull(reaction_why)))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))df_vac <- df_cv %>% filter(vax_status == "yes, i have")
df_not_vac <- df_cv %>% filter(vax_status != "yes, i have")
df_vac_imped <- df_vac[, c("vax_worries", "vax_worries_add")]
df_not_vac_imped <- df_not_vac[, c("vax_worries", "vax_worries_add")]
write.csv(df_vac_imped, "tables/vac_impediments1.csv")
write.csv(df_not_vac_imped, "tables/not_vac_impediments1.csv")not_vax_imped <- df_not_vac_imped$vax_worries[nchar(df_not_vac_imped$vax_worries) >= 14]
not_vax_imped## [1] "People still get covid, people still die even with the vaccine"
## [2] "Getting the COVID again "
## [3] "I'?m not worries"
## [4] "It's not been tested enough to actually be a vaccine. Two years isn't long enough. "
## [5] "Made it way fast "
## [6] "Health issues.\nImmunity"
## [7] "not tested enough"
## [8] "Depends on your concerns of certain side affects and what health issue you may have thst might make you prone to certain side affects"
## [9] "potential side effects of the vaccine and pharmaceutical companies not being at fault for any harm caused by the vaccine."
## [10] "Not proven to work and not proven to not do harm to your body"
## [11] "Don't know what the side effects are"
## [12] "Pain afterwards "
## [13] "It’s too risky"
## [14] "no long term studies"
## [15] "How do we know we might get sick in the future "
## [16] "I don't have any worries, just not getting it "
not_vax_imped_add <- df_not_vac_imped$vax_worries_add[nchar(df_not_vac_imped$vax_worries_add) >= 14]
not_vax_imped_add## [1] "Don’t get the shot let your own immunity work against it"
## [2] "If you want to get pregnant you shouldn't get this vaccine. "
## [3] "I am already protected"
## [4] "Noone should be f orced to put something in their body that is no proven to be safe"
## [5] "It doesn't work"
## [6] "No there is not "
vax_imped <- df_vac_imped$vax_worries[nchar(df_vac_imped$vax_worries) >= 14]
vax_imped## [1] "There are some people who have illnesses that can't get vaccinated. This can be a concern. These people can catch COVID-19 and spread it to others. People vaccinated can still get COVID-19 and die. Also, some people have side effects from the vaccine and get sick."
## [2] "ITS VERY MUTCH"
## [3] "It Might Be Bad."
## [4] "The has some virus or can damage your body system"
## [5] "Side effects, created too quickly "
## [6] "Most of the worries are overblown by the politicization of a vaccine which companies have been working on since SARS1."
## [7] "the long term side effects are unknown."
## [8] "Is the best option"
## [9] "Long term effects are unknown"
## [10] "there are no long term diagnosis for what it may bring on later."
## [11] "I don’t have any"
## [12] "I do not see any legit problems."
## [13] "people think that they can get the virus from the vax, that they are going to be microchipped, has aborted fetuses in it, they are just antivaxers because they don't want to put chemicals in their body even though they do it on a daily basis"
## [14] "If you will have a reaction to it"
## [15] "Short term and long term effects "
## [16] "Side effects are always a concern"
## [17] "the ingredients"
## [18] "Allergic reaction or not enough testing"
## [19] "Unproven \nLong term effects\nEffectiveness "
## [20] "It could be painful"
## [21] "Long term effects"
vax_imped_add <- df_vac_imped$vax_worries_add[nchar(df_vac_imped$vax_worries_add) >= 14]
vax_imped_add## [1] "Some people are just plain scared to get the vaccine. I'm not sure why."
## [2] "nope, it is safe"
## [3] "That can hurt your brain"
## [4] "deadly secondary effects"
## [5] "No one really knows if it is safe or not"
## [6] "Long term or short term effects"
## [7] "It depends who I am talking to. "
## [8] "Listen to doctors, not blogs"
## [9] "Absolutely nothing"
df_cv <-
df_cv %>%
mutate(
LocationLongitude = as.numeric(LocationLongitude),
LocationLatitude = as.numeric(LocationLatitude),
)
us <- c(left = -125, bottom = 24, right = -67, top = 49)
get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>%
ggmap() +
geom_point(
aes(x=LocationLongitude, y = LocationLatitude, color = legit_threat),
data = df_cv,
size = 5,
alpha=0.75
) +
theme_void() +
theme(legend.position = "bottom") +
labs(color="COVID is a legitimate threat")get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>%
ggmap() +
geom_point(
aes(x=LocationLongitude, y = LocationLatitude, color = vax_status),
data = df_cv,
size = 5,
alpha=0.75
) +
theme_void() +
theme(legend.position = "bottom") +
labs(color="Vax status")get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>%
ggmap() +
geom_point(
aes(x=LocationLongitude, y = LocationLatitude, color = ask_vax_stat),
data = df_cv,
size = 5,
alpha=0.75
) +
theme_void() +
theme(legend.position = "bottom") +
labs(color="Ask vax status")get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>%
ggmap() +
geom_point(
aes(x=LocationLongitude, y = LocationLatitude, color = vax_mandate_op),
data = df_cv,
size = 5,
alpha=0.75
) +
theme_void() +
theme(legend.position = "bottom") +
labs(color="Vax mandate")get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>%
ggmap() +
geom_point(
aes(x=LocationLongitude, y = LocationLatitude, color = friend_vax),
data = df_cv,
size = 5,
alpha=0.75
) +
theme_void() +
theme(legend.position = "bottom") +
labs(color="Friend to get vaccinated")get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>%
ggmap() +
geom_point(
aes(x=LocationLongitude, y = LocationLatitude, color = covid_pos),
data = df_cv,
size = 5,
alpha=0.75
) +
theme_void() +
theme(legend.position = "bottom") +
labs(color="COVID positive")paste0("Percentage of matched age: ",
round(mean(df_cv$age == df_cv$age.1) * 100, 2), "%")## [1] "Percentage of matched age: 76%"
paste0("Percentage of matched gender: ",
round(mean(ifelse(df_cv$gender == "a woman", "2", "1") == df_cv$gender.1), 2)*100,
"%")## [1] "Percentage of matched gender: 100%"