Data Reading

pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer)

set.seed(94305)
dir.create(file.path('tables'), showWarnings = FALSE)
dir.create(file.path('figures'), showWarnings = FALSE)

Load Data

# files <- list.files('./data', 
#                     pattern = '.*MIT.*.csv', 
#                     full.names = TRUE)
# (INPUT_FILENAME <- files[which.max(file.info(files)$mtime)])


df <- read.csv("data/latest/Vaccine Hesitancy Pilot - MIT SIDE - Copy_October 26, 2021_13.38.csv")

#df %>% select(contains("start"))

Data Cleaning

# filter out testing data
df_cv <- df %>% filter(StartDate >= "2021-10-25 17:00:00")

# filter out incomplete data
df_cv <- df_cv %>% filter(Progress == "100")
df_cv <- df_cv %>% filter(legit_threat != "")

# remove unnecessary information columns
df_cv <- df_cv %>%
  subset(
    select = -c(StartDate, EndDate, Status, IPAddress, Progress, ResponseId, RecordedDate, Finished, RecipientLastName, RecipientFirstName, RecipientEmail, ExternalReference, DistributionChannel, UserLanguage)
  )

# remove unmeaningful responses
df_cv <- df_cv %>%
  subset(
    select = -c(consent10, consent_participate, Get.started)
  )

df_cv$age <- as.numeric(df_cv$age)
df_cv[["age.1"]] <- as.numeric(df_cv$age.1)
df_cv[["Duration..in.seconds."]] <- as.numeric(df_cv[["Duration..in.seconds."]])

Data Rough Analysis

Response Multiple Choices

mc_variables <- c("legit_threat", "ask_vax_stat", "vax_mandate_op", "friend_vax", "vax_status", "covid_pos", "gender", "race", "lib_cons_1", "vote_2020", "age","income","educ","trust_gov_1", "trust_healthcare_1")

df_mc <- df_cv[, mc_variables]

makeCodebook(df_cv[, c(mc_variables, "express_true_op", "Duration..in.seconds.")], replace = TRUE,
             reportTitle = 'Pilot 1 Variables Code Book',
             file = 'data/codebook_df_cv_pilot1.Rmd')

Correlation plot

is_all_numeric <- function(x) {
  !any(is.na(suppressWarnings(as.numeric(na.omit(x))))) & is.character(x)
}
df_mc_numeric <-
  df_mc %>% 
  mutate(
    legit_threat = case_when(
      legit_threat == "absolutely" ~ 3,
      legit_threat == "for some ppl, sure" ~ 2,
      legit_threat == "vaccine is a bigger threat" ~ 1,
    ),
    ask_vax_stat = case_when(
      ask_vax_stat == "no problem at all" ~ 3,
      ask_vax_stat == "only with good reason" ~ 2,
      ask_vax_stat == "none of their biz" ~ 1,
    ),
    vax_mandate_op = case_when(
      vax_mandate_op == "yes, WAY overdue!" ~ 3,
      vax_mandate_op == "maybe..." ~ 2,
      vax_mandate_op == "absolutely NOT!" ~ 1,
    ),
    friend_vax = case_when(
      friend_vax == "GET IT!" ~ 3,
      friend_vax == "It depends..." ~ 2,
      friend_vax == "DON'T do it!" ~ 1,
    ),
    vax_status = case_when(
      vax_status == "yes, i have" ~ 3,
      vax_status == "i'm not telling you!" ~ 2,
      vax_status == "no, i haven't" ~ 1,
    ),
    covid_pos = case_when(
      covid_pos == "yes, i have" ~ 3,
      covid_pos == "i'm not telling you!" ~ 2,
      covid_pos == "no, i haven't" ~ 1,
    ),
    female = case_when(
      gender == "a woman" ~ 1,
      gender != "a woman" ~ 0
    ),
    white = case_when(
      race == "White or Caucasian" ~ 1,
      race != "White or Caucasian" ~ 0
    ),
    vote_trump = case_when(
      vote_2020 == "Trump" ~ 1,
      vote_2020 != "Trump" ~ 0,
    ),
    vote_biden = case_when(
      vote_2020 == "Biden" ~ 1,
      vote_2020 != "Biden" ~ 0,
    ),
  ) %>% 
  mutate_if(is_all_numeric, as.numeric) %>% 
  select(where(is.numeric), -lib_cons_1)
ggcorrplot(cor(df_mc_numeric, use = "pairwise.complete.obs"), type = "lower", lab = TRUE)

Vaccine Estimation

We should use this section to develop code to understand what combinations of the first four multiple choice questions can help us correctly identify the vaccination status. This would help us fork the chatbot to the right people.

Here is an estimation of the vaccinate status based on a decision tree.

Estimation using a decision tree

fit <- rpart(vax_status~ legit_threat + ask_vax_stat + vax_mandate_op + friend_vax, data = df_mc, minsplit = 1)
rpart.plot(fit)

Two-Way Cross Tabulations

Below are two-way cross tabulation tables based on individual variables with the vaccination status.

Legit Threat

table(df_mc$vax_status, df_mc$legit_threat)
##                       
##                        absolutely for some ppl, sure vaccine is a bigger threat
##   i'm not telling you!          1                  2                          0
##   no, i haven't                 6                  5                          5
##   yes, i have                  18                  8                          5

Ask Vax Stat

table(df_mc$vax_status, df_mc$ask_vax_stat)
##                       
##                        no problem at all none of their biz
##   i'm not telling you!                 0                 3
##   no, i haven't                        4                 9
##   yes, i have                          9                 7
##                       
##                        only with good reason
##   i'm not telling you!                     0
##   no, i haven't                            3
##   yes, i have                             15

Vax Mandate Op

table(df_mc$vax_status, df_mc$vax_mandate_op)
##                       
##                        absolutely NOT! maybe... yes, WAY overdue!
##   i'm not telling you!               3        0                 0
##   no, i haven't                     12        1                 3
##   yes, i have                        8       10                13

Friend Vax

table(df_mc$vax_status, df_mc$friend_vax)
##                       
##                        DON'T do it! GET IT! It depends...
##   i'm not telling you!            1       0             2
##   no, i haven't                   4       3             9
##   yes, i have                     1      24             6

COVID pos

table(df_mc$vax_status, df_mc$covid_pos)
##                       
##                        i'm not telling you! no, i haven't yes, i have
##   i'm not telling you!                    3             0           0
##   no, i haven't                           0            12           4
##   yes, i have                             1            21           9

Gender

table(df_mc$vax_status, df_mc$gender)
##                       
##                        a man a woman
##   i'm not telling you!     1       2
##   no, i haven't            8       8
##   yes, i have             16      15

Race

table(df_mc$vax_status, df_mc$race)
##                       
##                        Asian or Pacific Islander  Black or African American 
##   i'm not telling you!                          0                          1
##   no, i haven't                                 1                          2
##   yes, i have                                   4                          3
##                       
##                        Hispanic or Latino  White or Caucasian 
##   i'm not telling you!                   0                   2
##   no, i haven't                          2                  11
##   yes, i have                            4                  20

Liberal Conservative

table(df_mc$vax_status, df_mc$lib_cons_1)
##                       
##                         1  2  3  4  5
##   i'm not telling you!  1  0  1  0  1
##   no, i haven't         2  1  4  2  7
##   yes, i have           6  2 15  4  4

Vote 2020

table(df_mc$vax_status, df_mc$vote_2020)
##                       
##                        Biden Did not vote Other Trump
##   i'm not telling you!     0            1     0     2
##   no, i haven't            3            6     0     7
##   yes, i have             21            3     1     6

Income

table(df_mc$vax_status, df_mc$income)
##                       
##                        1. <$15,000 2. $15,000-$24,999 3. $25,000-$49,999
##   i'm not telling you!           0                  0                  0
##   no, i haven't                  2                  4                  5
##   yes, i have                    4                  2                  8
##                       
##                        4. $50,000-$74,999 5. $75,000-$99,999 6. >$100,000
##   i'm not telling you!                  1                  2            0
##   no, i haven't                         1                  2            2
##   yes, i have                           3                  7            7

Education

table(df_mc$vax_status, df_mc$educ)
##                       
##                        < high school 2-year degree 4-year degree
##   i'm not telling you!             0             0             1
##   no, i haven't                    1             3             2
##   yes, i have                      2             3             8
##                       
##                        graduate degree high school diploma/GED some college
##   i'm not telling you!               0                       1            1
##   no, i haven't                      1                       7            2
##   yes, i have                        6                       5            7

Trust Govt

table(df_mc$vax_status, df_mc$trust_gov_1)
##                       
##                        1 2 3 4 5
##   i'm not telling you! 1 1 1 0 0
##   no, i haven't        7 5 3 1 0
##   yes, i have          7 6 6 6 6

Trust Healthcare

table(df_mc$vax_status, df_mc$trust_healthcare_1)
##                       
##                        1 2 3 4 5
##   i'm not telling you! 0 0 2 0 1
##   no, i haven't        2 2 9 2 1
##   yes, i have          1 5 9 8 8

Three-way Cross Tabulations

Given the number of three-way cross tabulations being generated, all of the outputs are stored in tables folder as PDFs instead of being displayed here. The meaningful ones are also displayed in the analysis memo in the same folder as this code.

df_mc$weight <- rep(1, nrow(df_mc))
df_mc$age_binary <- ifelse(df_mc$age >= median(df_mc$age), "old", "young")
mc_no_vax <- mc_variables[!(mc_variables %in% c("vax_status", "age"))]

for (i in 1:(length(mc_no_vax) - 1)){
  for (j in (i+1):length(mc_no_vax)){
    x <- mc_no_vax[i]
    z <- mc_no_vax[j]
    df_mc %>%
      crosstab_3way(x = !!sym(x), y = vax_status, z = !!sym(z),
                    weight = weight) %>%
      kable(digits = 0,
            caption = paste0("Three Way Tab - vax_status + ", x, " + " , z),
            format = "html") %>%
      kable_styling() %>%
      save_kable(paste0("tables/v1/three_way_", x, "_", z, ".pdf"))
  }
}
mc_pre_text <- c("legit_threat", "ask_vax_stat", "vax_mandate_op", "friend_vax")
for (i in 1:(length(mc_pre_text) - 1)){
  for (j in (i+1):length(mc_pre_text)){
    x <- mc_pre_text[i]
    z <- mc_pre_text[j]
    df_mc %>%
      crosstab_3way(x = !!sym(x), y = vax_status, z = !!sym(z),
                    weight = weight) %>%
      kable(digits = 0,
            caption = paste0("Three Way Tab - vax_status + ", x, " + " , z),
            format = "html") %>%
      kable_styling() %>%
      save_kable(paste0("tables/v1/three_way_", x, "_", z, ".png"))
  }
}

Free text responses

# define free text vars here
free_text_vars <- c("vax_worries", "vax_worries_add", "reaction_why", "reaction_why_add", "express_true_op", "express_true_op_add")

Elaboration

Overall summary:

for (var in free_text_vars) {
  df_cv[, str_glue("{var}_nchar")] <- nchar(df_cv[, {var}])
}

df_cv %>% select(contains("nchar")) %>% summary()
##  vax_worries_nchar vax_worries_add_nchar reaction_why_nchar
##  Min.   :  2.00    Min.   : 2.00         Min.   :  2.00    
##  1st Qu.: 11.75    1st Qu.: 2.25         1st Qu.: 10.25    
##  Median : 19.50    Median : 8.00         Median : 25.50    
##  Mean   : 39.60    Mean   :14.58         Mean   : 32.72    
##  3rd Qu.: 41.25    3rd Qu.:16.00         3rd Qu.: 43.50    
##  Max.   :264.00    Max.   :83.00         Max.   :185.00    
##  reaction_why_add_nchar express_true_op_nchar express_true_op_add_nchar
##  Min.   :  2.00         Min.   :12.00         Min.   :  2.00           
##  1st Qu.:  8.25         1st Qu.:15.00         1st Qu.: 13.00           
##  Median : 15.50         Median :15.00         Median : 21.50           
##  Mean   : 26.72         Mean   :14.88         Mean   : 37.20           
##  3rd Qu.: 34.00         3rd Qu.:15.00         3rd Qu.: 41.25           
##  Max.   :126.00         Max.   :15.00         Max.   :235.00

Median number of characters for free text responses grouped by vaccination status:

df_cv %>% group_by(vax_status) %>% summarize_at(vars(contains("nchar")), ~ median(.)) %>% knitr::kable()
vax_status vax_worries_nchar vax_worries_add_nchar reaction_why_nchar reaction_why_add_nchar express_true_op_nchar express_true_op_add_nchar
i’m not telling you! 23 4 27 50 15 67.0
no, i haven’t 22 7 38 15 15 24.5
yes, i have 18 9 18 13 15 19.0

Wordclouds

Vaccine worries - Unvaccinated:

# Create corpus  
docs <- Corpus(VectorSource(df_cv %>% filter(vax_status != "yes, i have") %>% pull(vax_worries)))

# Clean corpus
docs <- 
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs)) 
words <- sort(rowSums(matrix), decreasing = TRUE) 
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

Vaccine worries - Vaccinated:

# Create corpus  
docs <- Corpus(VectorSource(df_cv %>% filter(vax_status == "yes, i have") %>% pull(vax_worries)))

# Clean corpus
docs <- 
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs)) 
words <- sort(rowSums(matrix), decreasing = TRUE) 
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

Reaction why - Unvaccinated:

# Create corpus  
docs <- Corpus(VectorSource(df_cv %>% filter(vax_status != "yes, i have") %>% pull(reaction_why)))

# Clean corpus
docs <- 
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs)) 
words <- sort(rowSums(matrix), decreasing = TRUE) 
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

Reaction why - Vaccinated:

# Create corpus  
docs <- Corpus(VectorSource(df_cv %>% filter(vax_status == "yes, i have") %>% pull(reaction_why)))

# Clean corpus
docs <- 
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs)) 
words <- sort(rowSums(matrix), decreasing = TRUE) 
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

Impediments

df_vac <- df_cv %>% filter(vax_status == "yes, i have")
df_not_vac <- df_cv %>% filter(vax_status != "yes, i have")

df_vac_imped <- df_vac[, c("vax_worries", "vax_worries_add")]
df_not_vac_imped <- df_not_vac[, c("vax_worries", "vax_worries_add")]




write.csv(df_vac_imped, "tables/vac_impediments1.csv")
write.csv(df_not_vac_imped, "tables/not_vac_impediments1.csv")

Not Vax Imped

not_vax_imped <- df_not_vac_imped$vax_worries[nchar(df_not_vac_imped$vax_worries) >= 14]
not_vax_imped
##  [1] "People still get covid, people still die even with the vaccine"                                                                       
##  [2] "Getting the COVID again "                                                                                                             
##  [3] "I'?m not worries"                                                                                                                     
##  [4] "It's not been tested enough to actually be a vaccine. Two years isn't long enough. "                                                  
##  [5] "Made it way fast "                                                                                                                    
##  [6] "Health issues.\nImmunity"                                                                                                             
##  [7] "not tested enough"                                                                                                                    
##  [8] "Depends on your concerns of certain side affects and what health issue you may have thst might make you prone to certain side affects"
##  [9] "potential side effects of the vaccine and pharmaceutical companies not being at fault for any harm caused by the vaccine."            
## [10] "Not proven to work and not proven to not do harm to your body"                                                                        
## [11] "Don't know what the side effects are"                                                                                                 
## [12] "Pain afterwards "                                                                                                                     
## [13] "It’s too risky"                                                                                                                       
## [14] "no long term studies"                                                                                                                 
## [15] "How do we know we might get sick in the future "                                                                                      
## [16] "I don't have any worries, just not getting it "

Not Vax Imped (Additional)

not_vax_imped_add <- df_not_vac_imped$vax_worries_add[nchar(df_not_vac_imped$vax_worries_add) >= 14]
not_vax_imped_add
## [1] "Don’t get the shot let your own immunity work against it"                           
## [2] "If you want to get pregnant you shouldn't get this vaccine. "                       
## [3] "I am already protected"                                                             
## [4] "Noone should be f orced to put something in their body that is no proven to be safe"
## [5] "It doesn't work"                                                                    
## [6] "No there is not "

Vax Imped

vax_imped <- df_vac_imped$vax_worries[nchar(df_vac_imped$vax_worries) >= 14]
vax_imped
##  [1] "There are some people who have illnesses that can't get vaccinated. This can be a concern. These people can catch COVID-19 and spread it to others. People vaccinated can still get COVID-19 and die. Also, some people have side effects from the vaccine and get sick."
##  [2] "ITS VERY MUTCH"                                                                                                                                                                                                                                                          
##  [3] "It Might Be Bad."                                                                                                                                                                                                                                                        
##  [4] "The has some virus or can damage your body system"                                                                                                                                                                                                                       
##  [5] "Side effects, created too quickly "                                                                                                                                                                                                                                      
##  [6] "Most of the worries are overblown by the politicization of a vaccine which companies have been working on since SARS1."                                                                                                                                                  
##  [7] "the long term side effects are unknown."                                                                                                                                                                                                                                 
##  [8] "Is the best option"                                                                                                                                                                                                                                                      
##  [9] "Long term effects are unknown"                                                                                                                                                                                                                                           
## [10] "there are no long term diagnosis for what it may bring on later."                                                                                                                                                                                                        
## [11] "I don’t have any"                                                                                                                                                                                                                                                        
## [12] "I do not see any legit problems."                                                                                                                                                                                                                                        
## [13] "people think that they can get the virus from the vax, that they are going to be microchipped, has aborted fetuses in it, they are just antivaxers because they don't want to put chemicals in their body even though they do it on a daily basis"                       
## [14] "If you will have a reaction to it"                                                                                                                                                                                                                                       
## [15] "Short term and long term effects "                                                                                                                                                                                                                                       
## [16] "Side effects are always a concern"                                                                                                                                                                                                                                       
## [17] "the ingredients"                                                                                                                                                                                                                                                         
## [18] "Allergic reaction or not enough testing"                                                                                                                                                                                                                                 
## [19] "Unproven \nLong term effects\nEffectiveness "                                                                                                                                                                                                                            
## [20] "It could be painful"                                                                                                                                                                                                                                                     
## [21] "Long term effects"

Vax Imped (Additional)

vax_imped_add <- df_vac_imped$vax_worries_add[nchar(df_vac_imped$vax_worries_add) >= 14]
vax_imped_add
## [1] "Some people are just plain scared to get the vaccine. I'm not sure why."
## [2] "nope, it is safe"                                                       
## [3] "That can hurt your brain"                                               
## [4] "deadly secondary effects"                                               
## [5] "No one really knows if it is safe or not"                               
## [6] "Long term or short term effects"                                        
## [7] "It depends who I am talking to. "                                       
## [8] "Listen to doctors, not blogs"                                           
## [9] "Absolutely nothing"

Spatial

Legitimate Threat

df_cv <- 
  df_cv %>% 
  mutate(
    LocationLongitude = as.numeric(LocationLongitude),
    LocationLatitude = as.numeric(LocationLatitude),
  )

us <- c(left = -125, bottom = 24, right = -67, top = 49)

get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>%
  ggmap() +
  geom_point(
    aes(x=LocationLongitude, y = LocationLatitude, color = legit_threat), 
    data = df_cv, 
    size = 5,
    alpha=0.75
  ) +
  theme_void() +
  theme(legend.position = "bottom") +
  labs(color="COVID is a legitimate threat")

Vax Status

get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>%
  ggmap() +
  geom_point(
    aes(x=LocationLongitude, y = LocationLatitude, color = vax_status), 
    data = df_cv, 
    size = 5,
    alpha=0.75
  ) +
  theme_void() +
  theme(legend.position = "bottom") +
  labs(color="Vax status")

Ask Vax Status

get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>%
  ggmap() +
  geom_point(
    aes(x=LocationLongitude, y = LocationLatitude, color = ask_vax_stat), 
    data = df_cv, 
    size = 5,
    alpha=0.75
  ) +
  theme_void() +
  theme(legend.position = "bottom") +
  labs(color="Ask vax status")

Vax Mandate

get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>%
  ggmap() +
  geom_point(
    aes(x=LocationLongitude, y = LocationLatitude, color = vax_mandate_op), 
    data = df_cv, 
    size = 5,
    alpha=0.75
  ) +
  theme_void() +
  theme(legend.position = "bottom") +
  labs(color="Vax mandate")

Friend Vax

get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>%
  ggmap() +
  geom_point(
    aes(x=LocationLongitude, y = LocationLatitude, color = friend_vax), 
    data = df_cv, 
    size = 5,
    alpha=0.75
  ) +
  theme_void() +
  theme(legend.position = "bottom") +
  labs(color="Friend to get vaccinated")

COVID Pos

get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>%
  ggmap() +
  geom_point(
    aes(x=LocationLongitude, y = LocationLatitude, color = covid_pos), 
    data = df_cv, 
    size = 5,
    alpha=0.75
  ) +
  theme_void() +
  theme(legend.position = "bottom") +
  labs(color="COVID positive")

Quality Control

paste0("Percentage of matched age: ", 
       round(mean(df_cv$age == df_cv$age.1) * 100, 2), "%")
## [1] "Percentage of matched age: 76%"
paste0("Percentage of matched gender: ", 
       round(mean(ifelse(df_cv$gender == "a woman", "2", "1") == df_cv$gender.1), 2)*100, 
       "%")
## [1] "Percentage of matched gender: 100%"