Free-Text Analysis

rm(list = ls())
library(openxlsx)
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, topicmodels, ldatuning, lda,SnowballC, pals, flextable, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr, politeness, textir, xtable, plotrix, ggplot2)

set.seed(94305)
dir.create(file.path('tables'), showWarnings = FALSE)
dir.create(file.path('figures'), showWarnings = FALSE)
dir.create(file.path('freetext'), showWarnings = FALSE)

Data

library(dplyr)
files <- list.files('./data', full.names = TRUE)
# pre_files <- files[grepl("Main PRE", files)]
# post_files <- files[grepl("Main POST", files)]
# INPUT_FILENAME_PRE <- pre_files[which.max(file.info(pre_files)$mtime)]
# INPUT_FILENAME_POST <- post_files[which.max(file.info(post_files)$mtime)]
# df_full_pre <- read_csv(INPUT_FILENAME_PRE) %>% clean_names %>% remove_empty()
# df_full_post <- read_csv(INPUT_FILENAME_POST) %>% clean_names %>% remove_empty()

followup_files <- files[grepl("Followup", files)]
INPUT_FILENAME_FOLLOWUP <- followup_files[which.max(file.info(followup_files)$mtime)]
df_full_followup <- read_csv(INPUT_FILENAME_FOLLOWUP) %>% clean_names %>% remove_empty()

# df_final <- readRDS("../main_analysis/data/df_final.rds")


df <- df_full_followup %>% filter(finished == "True") %>% filter(consent_course == "Yes, I agree") %>% filter(status == "IP Address")

contact_list <- read.csv("../contact_lists/main/misinfo_followup.csv") 
contact_list <- contact_list %>% rename(external_reference = ExternalDataReference)
contact_list$external_reference <- paste0("+", as.character(contact_list$external_reference))
df <- left_join(df, contact_list, by = "external_reference")

# df_post_valid_phone <- unique(df_final$phone_num)
# df_post <- df_full_post[df_full_post$phone_num %in% df_post_valid_phone, ]

# clean_phone_number <- function(phone){
#   if (is.na(phone)){
#     return ("")
#   }
#   if (((substr(phone, 1, 1) == "0") | (substr(phone, 1, 1) == "O") | (substr(phone, 1, 1) == "+")) & nchar(phone) == 10){
#     phone <- substr(phone, 2, nchar(phone))
#   } else if (((substr(phone, 1, 3) == "254")) & nchar(phone) == 12){
#     phone <- substr(phone, 4, nchar(phone))
#   } else if (((substr(phone, 1, 4) == "2540")) & nchar(phone) == 13){
#     phone <- substr(phone, 5, nchar(phone))
#   } else if (((substr(phone, 1, 4) == "+254")) & nchar(phone) == 13){
#     phone <- substr(phone, 5, nchar(phone))
#   } else if (((substr(phone, 1, 4) == "±254")) & nchar(phone) == 13){
#     phone <- substr(phone, 5, nchar(phone))
#   } else if (((substr(phone, 1, 4) == "†254")) & nchar(phone) == 13){
#     phone <- substr(phone, 5, nchar(phone))
#   } else if (((substr(phone, 1, 5) == "+2540")) & nchar(phone) == 14){
#     phone <- substr(phone, 6, nchar(phone))
#   } else {
#     phone <- ""
#   }
#   return (phone)
#   
#   
# }
# df_post_phone <- sapply(df_post_valid_phone, function(x) clean_phone_number(x))

free_text_columns <- colnames(df)[startsWith(colnames(df), "reflective")]
reflective_questions <- c("What are some techniques that people use to create misleading social media posts?",
                          "When browsing your timeline in the last month, did you notice any post that looked misleading? If so, what made it seem misleading?",
                          "How did you feel when you saw the misleading post? If you haven't seen any misleading posts recently, how do you think you would feel?",
                          "Has the Inoculation against Misinformation course changed your behavior on social media? If so, how?",
                          "If you were to tell a friend what you learned in the course, what tip would you share?")


# for (i in 1:5){
#   output <- df[free_text_columns[i]]
#   colnames(output) <- reflective_questions[i]
#   write.csv(output, paste0("./freetext/reflective_questions_", i, ".csv"))
# }

Generate Descriptive Statistics

Create variables of interest

options(scipen=999)
treatments <- c("control_delayed", "control_alternative", "tactics", "emotion", "combo", "overall")
for (ftc in free_text_columns){
  question_num <- substr(ftc, 12, 12)
  output_name <- paste0("Reflective Question ", question_num)
  total_output <- c()
  for (t in treatments){
    if (t != "overall"){
      q <- df %>% filter(treatment == t) %>% select(ftc)
    } else {
      q <- df %>% select(ftc)
    }
    num_characters_mean <- mean(sapply(q, function(x) nchar(x)), na.rm = T)
    num_characters_se <- std.error(sapply(q, function(x) nchar(x)), na.rm = T)
    num_word_mean <- mean(sapply(q, function(x) str_count(x, "\\w+")), na.rm = T)
    num_word_se <- std.error(sapply(q, function(x) str_count(x, "\\w+")), na.rm = T)
    include_yes_mean <- mean(sapply(q, function(x) grepl("\\byes\\b", x, ignore.case = TRUE)), na.rm = T)
    include_yes_se <- std.error(sapply(q, function(x) grepl("\\byes\\b", x, ignore.case = TRUE)), na.rm = T)
    include_no_mean <- mean(sapply(q, function(x) grepl("\\bno\\b", x, ignore.case = TRUE) & !grepl("\\byes\\b", x, ignore.case = TRUE)), na.rm = T)
    include_no_se <- std.error(sapply(q, function(x) grepl("\\bno\\b", x, ignore.case = TRUE) & !grepl("\\byes\\b", x, ignore.case = TRUE)), na.rm = T)
    only_yes_mean <- mean(sapply(q, function(x) grepl("\\byes\\b", x, ignore.case = TRUE) & nchar(x) <= 5), na.rm = T)
    only_yes_se <- std.error(sapply(q, function(x) grepl("\\byes\\b", x, ignore.case = TRUE) & nchar(x) <= 5), na.rm = T)
    only_no_mean <- mean(sapply(q, function(x) grepl("\\bno\\b", x, ignore.case = TRUE) & !grepl("\\byes\\b", x, ignore.case = TRUE) & nchar(x) <= 5), na.rm = T)
    only_no_se <- std.error(sapply(q, function(x) grepl("\\bno\\b", x, ignore.case = TRUE) & !grepl("\\byes\\b", x, ignore.case = TRUE) & nchar(x) <= 5), na.rm = T)
    more_than_yes_mean <- mean(sapply(q, function(x) grepl("\\byes\\b", x, ignore.case = TRUE) & nchar(x) > 5), na.rm = T)
    more_than_yes_se <- std.error(sapply(q, function(x) grepl("\\byes\\b", x, ignore.case = TRUE) & nchar(x) > 5), na.rm = T)
    more_than_no_mean <- mean(sapply(q, function(x) grepl("\\bno\\b", x, ignore.case = TRUE) & !grepl("\\byes\\b", x, ignore.case = TRUE) & nchar(x) > 5), na.rm = T)
    more_than_no_se <- std.error(sapply(q, function(x) grepl("\\bno\\b", x, ignore.case = TRUE) & !grepl("\\byes\\b", x, ignore.case = TRUE) & nchar(x) > 5), na.rm = T)
    other_mean <- mean(sapply(q, function(x) !(grepl("\\bno\\b", x, ignore.case = TRUE) | grepl("\\byes\\b", x, ignore.case = TRUE))), na.rm = T)
    other_se <- std.error(sapply(q, function(x) !(grepl("\\bno\\b", x, ignore.case = TRUE) | grepl("\\byes\\b", x, ignore.case = TRUE))), na.rm = T)
    
    output <- c(round(num_characters_mean, 4), paste0("(", round(num_characters_se, 2), ")"), 
                round(num_word_mean, 4), paste0("(", round(num_word_se, 2), ")"), 
                round(include_yes_mean, 4), paste0("(", round(include_yes_se, 2), ")"), 
                round(include_no_mean, 4), paste0("(", round(include_no_se, 2), ")"), 
                round(only_yes_mean, 4), paste0("(", round(only_yes_se, 2), ")"), 
                round(only_no_mean, 4), paste0("(", round(only_no_se, 2), ")"), 
                round(more_than_yes_mean, 4), paste0("(", round(more_than_yes_se, 2), ")"), 
                round(more_than_no_mean, 4), paste0("(", round(more_than_no_se, 2), ")"), 
                round(other_mean, 4), paste0("(", round(other_se, 2), ")"))
    total_output <- cbind(total_output, output)
  }
  colnames(total_output) <- c("Control", "Reminder", "Tactics", "Emotion", "Combo", "Overall")
  rownames(total_output) <- c("Number of Characters", "",
                              "Number of Words", "",
                              "Yes Responses", "",
                              "No Responses", "",
                              "Only 'Yes' Responses", "",
                              "Only 'No' Responses", "",
                              "More than 'Yes' Responses", "",
                              "More than 'No' Responses", "",
                              "Other Responses", "")
  print(xtable(total_output, type = "latex", caption = paste0("Descriptive Statistics for ", output_name)), file = paste0("./freetext/", ftc, ".latex"))
}

df$num_characters_4 <- sapply(df$reflective_4, function(x) nchar(x))
df$num_characters_5 <- sapply(df$reflective_5, function(x) nchar(x))
test_4 <- t.test(df$num_characters_4[df$treatment == "emotion"], df$num_characters_4[df$treatment == "control_alternative"], alternative = c("greater"))
test_5 <- t.test(df$num_characters_5[df$treatment == "emotion"], df$num_characters_5[df$treatment == "control_alternative"], alternative = c("greater"))


paste0("diff: ", round(test_4$estimate[1] - test_4$estimate[2], 1), " se: ", round(test_4$stderr, 2))

## [1] "diff: 3.3 se: 1.82"

paste0("diff: ", round(test_5$estimate[1] - test_5$estimate[2], 1), " se: ", round(test_5$stderr, 2))

## [1] "diff: 2.9 se: 1.91"

Reflective Question 4

Has the Inoculation against Misinformation course changed your behavior on social media? If so, how?

Heuristics

df_4 <- df[, c("treatment", "reflective_4")]
df_4$contain_course_info <- ifelse(grepl("stop|think|first|check|evaluate|identify|investigate|analyze|research|pause|question|verify|verified|identified|prove|proved|differentiate|distinguish|tell.*difference|spot|confirm|confirmed|researched|analyzed|before|tell.*between|ask myself|asked myself", df_4$reflective_4), 1L, 0L)
df_4 %>% group_by(treatment) %>% summarise(percentage_mentioned = mean(contain_course_info), count_mentioned = sum(contain_course_info), total_in_group = n())

output <- df_4[order(df_4$contain_course_info, decreasing = TRUE), c("treatment", "contain_course_info", "reflective_4")]
colnames(output) <- c("treatment_group", "contain_course_keyword", reflective_questions[4])
write.csv(output, paste0("./freetext/reflective_questions_4_heuristics.csv"))

Contain Heuristics

datatable(output[output$contain_course_keyword == 1, 3])

Does not contain heuristics

datatable(output[output$contain_course_keyword == 0, 3])

Conduct Hypothesis Test

All treatment courses aggregated vs reminder
All treatment courses aggregated vs (reminder + control)
Emotion vs reminder
Tactics vs reminder
Combo vs reminder

test_1_q4 <- t.test(output %>% filter(treatment_group %in% c("emotion", "tactics", "combo")) %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("control_alternative")) %>% select(contain_course_keyword),
                    alternative = "greater")
test_2_q4 <- t.test(output %>% filter(treatment_group %in% c("control_delayed")) %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("control_alternative")) %>% select(contain_course_keyword),
                    alternative = "greater")
test_3_q4 <- t.test(output %>% filter(treatment_group %in% c("emotion")) %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("control_alternative")) %>% select(contain_course_keyword),
                    alternative = "greater")
test_4_q4 <- t.test(output %>% filter(treatment_group %in% c("tactics")) %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("control_alternative")) %>% select(contain_course_keyword),
                    alternative = "greater")
test_5_q4 <- t.test(output %>% filter(treatment_group %in% c("combo")) %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("control_alternative")) %>% select(contain_course_keyword),
                    alternative = "greater")

results_q4 = data.frame(matrix(NA,5,6))
colnames(results_q4) = c("estimates","std.err","CI_lw","CI_up","ts","p_val")
rownames(results_q4) = c("Test 1 - All Treatment Courses v. Reminder",
                         "Test 2 - Control (Combo) v. Reminder", 
                         "Test 3 - Emotion v. Reminder",
                         "Test 4 - Tactics v. Reminder",
                         "Test 5 - Combo v. Reminder")

results_q4$estimates <- c(test_1_q4$estimate[1] - test_1_q4$estimate[2],
                          test_2_q4$estimate[1] - test_2_q4$estimate[2],
                          test_3_q4$estimate[1] - test_3_q4$estimate[2],
                          test_4_q4$estimate[1] - test_4_q4$estimate[2],
                          test_5_q4$estimate[1] - test_5_q4$estimate[2])
results_q4$std.err <- c(test_1_q4$stderr,
                        test_2_q4$stderr,
                        test_3_q4$stderr,
                        test_4_q4$stderr,
                        test_5_q4$stderr)
results_q4$CI_lw <- c(test_1_q4$conf.int[1],
                      test_2_q4$conf.int[1],
                      test_3_q4$conf.int[1],
                      test_4_q4$conf.int[1],
                      test_5_q4$conf.int[1])
results_q4$CI_up <- c(test_1_q4$conf.int[2],
                      test_2_q4$conf.int[2],
                      test_3_q4$conf.int[2],
                      test_4_q4$conf.int[2],
                      test_5_q4$conf.int[2])
results_q4$ts <- c(test_1_q4$statistic,
                   test_2_q4$statistic,
                   test_3_q4$statistic,
                   test_4_q4$statistic,
                   test_5_q4$statistic)
results_q4$p_val <- c(test_1_q4$p.value,
                      test_2_q4$p.value,
                      test_3_q4$p.value,
                      test_4_q4$p.value,
                      test_5_q4$p.value)
results_q4$p_val_holm = p.adjust(results_q4$p_val,"holm")

results_q4

Plot

plot_gen = function(data,color,ylab,xlab,y_min,y_max,title,num_size,baseline,tests){
    
pic = ggplot(data=data,aes(x=label,y=mean))+
        theme_bw()+
        theme(axis.line.y = element_line(colour="black"),panel.border = element_blank(), 
              panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        legend.title = element_blank(),      
        legend.position = c(.9,.8),legend.justification = c("right", "bottom"), 
        legend.key = element_rect(colour = "transparent"),
        legend.box.just = "right", legend.text = element_text(size=12), legend.margin = margin(6, 6, 6, 6),
        legend.box.background = element_rect( fill="transparent", size=1),legend.background = element_blank()) +
    
        geom_bar(stat='identity',width=0.75,color="black",fill=color) +
        geom_errorbar(aes(ymin=mean-1.96*sd, ymax = mean+1.96*sd),width=0.1, size=1.5, position = position_dodge(1)) 
    
        if (baseline){
        
        pic = pic + geom_text(aes(label=as.character(formatC(mean,digits=3,format="f")),y=mean+sign(mean)*sd),vjust = -2.5,size=num_size)+
        geom_text(aes(label=paste("[",as.character(round(mean/mean[2]*100)),"%]",sep=""),y=mean+sign(mean)*sd),vjust = -1,size=num_size-1)}
    
        if (!baseline){
            
        pic = pic + geom_text(aes(label=as.character(formatC(mean,digits=3,format="f")),y=mean+sd),vjust = -1,size=num_size)    
        }
    
               
        
        pic = pic + geom_hline(yintercept = 0)+

        geom_segment(aes(x = 2, y = 0.92, xend = 5, yend = 0.92), color = "orange2",size=2)+
        geom_segment(aes(x = 2, y = 0.92, xend = 2, yend = 0.72), color = "orange2",size=2)+
        geom_segment(aes(x = 5, y = 0.92, xend = 5, yend = 0.60), color = "orange2",size=2)+
    
        geom_segment(aes(x = 2, y = 0.82, xend = 4, yend = 0.82), color = "pink2",size=2)+
        geom_segment(aes(x = 2, y = 0.82, xend = 2, yend = 0.72), color = "pink2",size=2)+
        geom_segment(aes(x = 4, y = 0.82, xend = 4, yend = 0.56), color = "pink2",size=2)+
    
        geom_segment(aes(x = 2, y = 0.72, xend = 3, yend = 0.72), color = "turquoise2",size=2)+
        geom_segment(aes(x = 2, y = 0.72, xend = 2, yend = 0.72), color = "turquoise2",size=2)+
        geom_segment(aes(x = 3, y = 0.72, xend = 3, yend = 0.62), color = "turquoise2",size=2)+
    
        geom_segment(aes(x = 1, y = 0.85, xend = 2, yend = 0.85), color = "gray70",size=2)+
        geom_segment(aes(x = 1, y = 0.85, xend = 1, yend = 0.68), color = "gray70",size=2)+
        geom_segment(aes(x = 2, y = 0.85, xend = 2, yend = 0.68), color = "gray70",size=2)+

    
        ggplot2::annotate("text",x=4.5,y=0.875,label = paste0(tests$estimates[4], "\n(",tests$std.err[4],")"),size=11) +
        ggplot2::annotate("text",x=3.5,y=0.775,label = paste0(tests$estimates[3], "\n(",tests$std.err[3],")"),size=11) +    
        ggplot2::annotate("text",x=2.5,y=0.675,label = paste0(tests$estimates[2], "\n(",tests$std.err[2],")"),size=11) +
        ggplot2::annotate("text",x=1.5,y=0.805,label = paste0(tests$estimates[1], "\n(",tests$std.err[1],")"),size=11) +
    
        #annotate("text",x=4.25,y=1.02,label = "Diff:\n",size=11) +
        #annotate("text",x=3.25,y=0.92,label = "Diff:\n",size=11) +    
        #annotate("text",x=2.25,y=0.82,label = "Diff:\n",size=11) +
        #annotate("text",x=1.25,y=0.95,label = "Diff:\n",size=11) +

    
       labs(y=ylab,x=xlab)+
        theme(axis.text.x = element_text(color = "black", size = 40, angle = 0, hjust = .5, vjust = 5, face = "plain"),
        axis.text.y = element_text(color = "black", size = 45, angle = 0, hjust = 0, vjust = .5, face = "plain",
                                   margin=unit(rep(0.5,4),"cm")),  
        axis.title.x = element_text(color = "black", size = 55, angle = 0, hjust = .5, vjust = 3, face = "bold"),
        axis.title.y = element_text(color = "black", size = 60, angle = 90, hjust = .5, vjust = .5, face = "bold"),
        axis.ticks.length.y = unit(-0.25,"cm"), axis.ticks.x=element_blank())+
        scale_y_continuous(limits = c(y_min,y_max)) +
        scale_x_discrete(limits = data$label) +

    
        ggtitle(title) + 
          theme(plot.title = element_text(face='bold', size=50, hjust=0.5, vjust=0.5))
        return(pic)
}

plot_q4_data = data.frame(matrix(NA,5,3))
colnames(plot_q4_data) = c("label","mean","sd")

plot_q4_data[,1:2] = aggregate(contain_course_keyword~treatment_group,output,mean)
plot_q4_data[,3] = aggregate(contain_course_keyword~treatment_group,output,std.error)[,2]


plot_q4_data$label[plot_q4_data$label == "control_delayed"] <- "No-course Control"
plot_q4_data$label[plot_q4_data$label == "control_alternative"] <- "Reminder Control"
plot_q4_data$label[plot_q4_data$label == "tactics"] <- "Info"
plot_q4_data$label[plot_q4_data$label == "emotion"] <- "Emotions"
plot_q4_data$label[plot_q4_data$label == "combo"] <- "Combo"

plot_q4_data <- plot_q4_data %>%
  slice(match(c("No-course Control", "Reminder Control",  "Info", "Emotions", "Combo"), label))

tests_plot_q4 = data.frame(matrix(NA),4,2)
tests_plot_q4 = round(results_q4[2:5,c("estimates","std.err")],3)


plot_gen(plot_q4_data,c("gray70","royalblue3","turquoise2","pink2","orange2"),"Proportion of Participant Responses containing Keywords","Assignment group",-0.0003,1,"",5,TRUE, tests_plot_q4)

WordCloud

vector_wc <- df_4$reflective_4
# Create corpus
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

# dataset_names <- list('Contain' = output[output$contain_course_keyword == 1, ], 
#                       'Does not contain' = output[output$contain_course_keyword == 0, ])
# openxlsx::write.xlsx(dataset_names,  paste0("./freetext/reflective_questions_4_heuristics.xlsx"))

Reflective Question 5

If you were to tell a friend what you learned in the course, what tip would you share?

Heuristics

df_5 <- df[, c("treatment", "reflective_5")]
df_5$contain_course_info <- ifelse(grepl("stop|think|first|check|evaluate|identify|investigate|analyze|research|pause|question|verify|verified|identified|prove|proved|differentiate|distinguish|tell.*difference|spot|confirm|confirmed|researched|analyzed|before|tell.*between|ask myself|asked myself", df_5$reflective_5), 1L, 0L)
df_5 %>% group_by(treatment) %>% summarise(percentage_mentioned = mean(contain_course_info), count_mentioned = sum(contain_course_info), total_in_group = n())

output <- df_5[order(df_5$contain_course_info, decreasing = TRUE), c("treatment", "contain_course_info", "reflective_5")]
colnames(output) <- c("treatment_group", "contain_course_keyword", reflective_questions[5])
# write.csv(output, paste0("./freetext/reflective_questions_5_heuristics.csv"))

# library(openxlsx)
# dataset_names <- list('Contain' = output[output$contain_course_keyword == 1, ], 
#                       'Does not contain' = output[output$contain_course_keyword == 0, ])
# openxlsx::write.xlsx(dataset_names,  paste0("./freetext/reflective_questions_5_heuristics.xlsx"))

Contain Heuristics

datatable(output[output$contain_course_keyword == 1, 3])

Does not contain heuristics

datatable(output[output$contain_course_keyword == 0, 3])

Conduct Hypothesis Test

All treatment courses aggregated vs reminder
All treatment courses aggregated vs (reminder + control)
Emotion vs reminder
Tactics vs reminder
Combo vs reminder

test_1_q5 <- t.test(output %>% filter(treatment_group %in% c("emotion", "tactics", "combo")) %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("control_alternative")) %>% select(contain_course_keyword),
                    alternative = "greater")
test_2_q5 <- t.test(output %>% filter(treatment_group %in% c("control_delayed")) %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("control_alternative")) %>% select(contain_course_keyword),
                    alternative = "greater")
test_3_q5 <- t.test(output %>% filter(treatment_group %in% c("emotion")) %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("control_alternative")) %>% select(contain_course_keyword),
                    alternative = "greater")
test_4_q5 <- t.test(output %>% filter(treatment_group %in% c("tactics")) %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("control_alternative")) %>% select(contain_course_keyword),
                    alternative = "greater")
test_5_q5 <- t.test(output %>% filter(treatment_group %in% c("combo")) %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("control_alternative")) %>% select(contain_course_keyword),
                    alternative = "greater")

results_q5 = data.frame(matrix(NA,5,6))
colnames(results_q5) = c("estimates","std.err","CI_lw","CI_up","ts","p_val")
rownames(results_q5) = c("Test 1 - All Treatment Courses v. Reminder",
                         "Test 2 - Control (Combo) v. Reminder", 
                         "Test 3 - Emotion v. Reminder",
                         "Test 4 - Tactics v. Reminder",
                         "Test 5 - Combo v. Reminder")

results_q5$estimates <- c(test_1_q5$estimate[1] - test_1_q5$estimate[2],
                          test_2_q5$estimate[1] - test_2_q5$estimate[2],
                          test_3_q5$estimate[1] - test_3_q5$estimate[2],
                          test_4_q5$estimate[1] - test_4_q5$estimate[2],
                          test_5_q5$estimate[1] - test_5_q5$estimate[2])
results_q5$std.err <- c(test_1_q5$stderr,
                        test_2_q5$stderr,
                        test_3_q5$stderr,
                        test_4_q5$stderr,
                        test_5_q5$stderr)
results_q5$CI_lw <- c(test_1_q5$conf.int[1],
                      test_2_q5$conf.int[1],
                      test_3_q5$conf.int[1],
                      test_4_q5$conf.int[1],
                      test_5_q5$conf.int[1])
results_q5$CI_up <- c(test_1_q5$conf.int[2],
                      test_2_q5$conf.int[2],
                      test_3_q5$conf.int[2],
                      test_4_q5$conf.int[2],
                      test_5_q5$conf.int[2])
results_q5$ts <- c(test_1_q5$statistic,
                   test_2_q5$statistic,
                   test_3_q5$statistic,
                   test_4_q5$statistic,
                   test_5_q5$statistic)
results_q5$p_val <- c(test_1_q5$p.value,
                      test_2_q5$p.value,
                      test_3_q5$p.value,
                      test_4_q5$p.value,
                      test_5_q5$p.value)
results_q5$p_val_holm = p.adjust(results_q5$p_val,"holm")

results_q5

Plot

plot_q5_data = data.frame(matrix(NA,5,3))
colnames(plot_q5_data) = c("label","mean","sd")

plot_q5_data[,1:2] = aggregate(contain_course_keyword~treatment_group,output,mean)
plot_q5_data[,3] = aggregate(contain_course_keyword~treatment_group,output,std.error)[,2]


plot_q5_data$label[plot_q5_data$label == "control_delayed"] <- "No-course Control"
plot_q5_data$label[plot_q5_data$label == "control_alternative"] <- "Reminder Control"
plot_q5_data$label[plot_q5_data$label == "tactics"] <- "Info"
plot_q5_data$label[plot_q5_data$label == "emotion"] <- "Emotions"
plot_q5_data$label[plot_q5_data$label == "combo"] <- "Combo"

plot_q5_data <- plot_q5_data %>%
  slice(match(c("No-course Control", "Reminder Control",  "Info", "Emotions", "Combo"), label))

tests_plot_q5 = data.frame(matrix(NA),4,2)
tests_plot_q5 = round(results_q5[2:5,c("estimates","std.err")],3)


plot_gen(plot_q5_data,c("gray70","royalblue3","turquoise2","pink2","orange2"),"Proportion of Participant Responses containing Keywords","Assignment group",-0.0003,1,"",5,TRUE, tests_plot_q5)

WordCloud

vector_wc <- df_5$reflective_5
# Create corpus
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Comparing Gender

df_final <- readRDS("df_final.rds")

df_why <- df_final %>% select(user, accuracy, treatment, att_check_pre, att_check_post, 
                              reflective_1, reflective_2, reflective_3, reflective_4, reflective_5, 
                              gender)
df_why <- df_why %>% distinct()

Reflective Question 4

Emotions

WordCloud with Word Frequency >= 25

Man

vector_wc <- df_why %>% filter(treatment == "emotion", gender == "Man") %>% select(reflective_4)
# Create corpus
docs <- VCorpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

BigramTokenizer <- function(x) c(unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE), 
                                 unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs,  control = list(tokenize = BigramTokenizer)))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)



# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 25, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

row.names(df_freetext) <- 1:nrow(df_freetext)
as.matrix(df_freetext[1:20,]) %>% kable(format = 'pipe') %>% print()

word	freq
yes	343
information	275
na na	203
na yes	148
share	110
media	105
social	102
social media	100
post	97
misinformation	90
sharing	73
posts	72
now	66
misleading	65
changed	63
dont	61
can	55
know	51
true	49
careful	42

Woman

vector_wc <- df_why %>% filter(treatment == "emotion", gender == "Woman") %>% select(reflective_4)
# Create corpus
docs <- VCorpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

BigramTokenizer <- function(x) c(unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE), 
                                 unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs,  control = list(tokenize = BigramTokenizer)))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)



# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 25, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

row.names(df_freetext) <- 1:nrow(df_freetext)
as.matrix(df_freetext[1:20,]) %>% kable(format = 'pipe') %>% print()

word	freq
yes	207
information	158
na na	103
social	85
media	84
social media	81
na yes	74
post	73
share	73
dont	54
sharing	51
misinformation	47
true	37
believe	33
misleading	33
posts	33
changed	32
know	31
now	30
sure	26

Reasoning

WordCloud with Word Frequency >= 25

Man

vector_wc <- df_why %>% filter(treatment == "tactics", gender == "Man") %>% select(reflective_4)
# Create corpus
docs <- VCorpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

BigramTokenizer <- function(x) c(unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE), 
                                 unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs,  control = list(tokenize = BigramTokenizer)))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)



# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 25, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

row.names(df_freetext) <- 1:nrow(df_freetext)
as.matrix(df_freetext[1:20,]) %>% kable(format = 'pipe') %>% print()

word	freq
yes	356
information	289
na na	164
na yes	133
social	130
media	128
social media	125
post	84
posts	80
now	77
share	76
misinformation	74
sharing	69
misleading	68
can	59
keen	56
dont	54
changed	52
careful	49
able	44

Woman

vector_wc <- df_why %>% filter(treatment == "tactics", gender == "Woman") %>% select(reflective_4)
# Create corpus
docs <- VCorpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

BigramTokenizer <- function(x) c(unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE), 
                                 unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs,  control = list(tokenize = BigramTokenizer)))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)



# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 25, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

row.names(df_freetext) <- 1:nrow(df_freetext)
as.matrix(df_freetext[1:20,]) %>% kable(format = 'pipe') %>% print()

word	freq
yes	183
information	133
na na	95
post	66
na yes	63
media	60
social	59
social media	58
dont	44
share	41
misleading	40
misinformation	38
posts	37
now	34
true	30
know	28
sharing	27
sure	27
changed	26
yesi	26

Combo

WordCloud with Word Frequency >= 25

Man

vector_wc <- df_why %>% filter(treatment == "combo", gender == "Man") %>% select(reflective_4)
# Create corpus
docs <- VCorpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

BigramTokenizer <- function(x) c(unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE), 
                                 unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs,  control = list(tokenize = BigramTokenizer)))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)



# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 25, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

row.names(df_freetext) <- 1:nrow(df_freetext)
as.matrix(df_freetext[1:20,]) %>% kable(format = 'pipe') %>% print()

word	freq
yes	385
information	260
na na	167
na yes	160
post	127
media	120
social	120
social media	117
share	107
misleading	91
posts	90
sharing	78
dont	74
now	71
misinformation	64
can	53
changed	49
misleading information	46
made	42
research	38

Woman

vector_wc <- df_why %>% filter(treatment == "combo", gender == "Woman") %>% select(reflective_4)
# Create corpus
docs <- VCorpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

BigramTokenizer <- function(x) c(unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE), 
                                 unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs,  control = list(tokenize = BigramTokenizer)))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)



# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 25, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

row.names(df_freetext) <- 1:nrow(df_freetext)
as.matrix(df_freetext[1:20,]) %>% kable(format = 'pipe') %>% print()

word	freq
yes	177
information	144
na na	95
media	69
na yes	69
social	69
social media	66
share	64
post	56
dont	55
misleading	43
sharing	40
misinformation	39
now	38
posts	37
sure	33
keen	30
see	29
believe	26
yesi	26

Reflective Question 5

Emotions

WordCloud with Word Frequency >= 25

Man

vector_wc <- df_why %>% filter(treatment == "emotion", gender == "Man") %>% select(reflective_5)
# Create corpus
docs <- VCorpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

BigramTokenizer <- function(x) c(unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE), 
                                 unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs,  control = list(tokenize = BigramTokenizer)))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)



# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 25, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

row.names(df_freetext) <- 1:nrow(df_freetext)
as.matrix(df_freetext[1:20,]) %>% kable(format = 'pipe') %>% print()

word	freq
information	347
na na	189
misinformation	188
media	184
social	181
social media	173
sharing	120
misleading	117
share	116
post	100
always	93
misleading information	56
posts	56
true	52
careful	48
avoid	45
sure	44
disinformation	41
dont	40
misinformation na	38

Woman

vector_wc <- df_why %>% filter(treatment == "emotion", gender == "Woman") %>% select(reflective_5)
# Create corpus
docs <- VCorpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

BigramTokenizer <- function(x) c(unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE), 
                                 unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs,  control = list(tokenize = BigramTokenizer)))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)



# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 25, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

row.names(df_freetext) <- 1:nrow(df_freetext)
as.matrix(df_freetext[1:20,]) %>% kable(format = 'pipe') %>% print()

word	freq
information	182
media	137
social	135
social media	133
misinformation	103
na na	103
share	80
sharing	67
misleading	59
post	59
true	47
posts	43
always	41
careful	31
believe	30
sure	30
misleading information	29
dont	28
never	28
see	28

Reasoning

WordCloud with Word Frequency >= 25

Man

vector_wc <- df_why %>% filter(treatment == "tactics", gender == "Man") %>% select(reflective_5)
# Create corpus
docs <- VCorpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

BigramTokenizer <- function(x) c(unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE), 
                                 unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs,  control = list(tokenize = BigramTokenizer)))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)



# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 25, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

row.names(df_freetext) <- 1:nrow(df_freetext)
as.matrix(df_freetext[1:20,]) %>% kable(format = 'pipe') %>% print()

word	freq
information	358
social	194
media	192
social media	186
misinformation	161
na na	158
misleading	131
share	98
sharing	88
misleading information	77
always	76
post	70
posts	63
true	60
careful	50
check	47
people	41
source	41
keen	38
avoid	36

Woman

vector_wc <- df_why %>% filter(treatment == "tactics", gender == "Woman") %>% select(reflective_5)
# Create corpus
docs <- VCorpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

BigramTokenizer <- function(x) c(unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE), 
                                 unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs,  control = list(tokenize = BigramTokenizer)))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)



# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 25, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

row.names(df_freetext) <- 1:nrow(df_freetext)
as.matrix(df_freetext[1:20,]) %>% kable(format = 'pipe') %>% print()

word	freq
information	144
media	113
social	113
social media	110
misinformation	94
na na	94
misleading	73
post	49
share	43
posts	41
true	38
sharing	34
always	33
see	31
misleading information	30
careful	28
everything	28
keen	28
tell	28
sure	25

Combo

WordCloud with Word Frequency >= 25

Man

vector_wc <- df_why %>% filter(treatment == "combo", gender == "Man") %>% select(reflective_5)
# Create corpus
docs <- VCorpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

BigramTokenizer <- function(x) c(unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE), 
                                 unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs,  control = list(tokenize = BigramTokenizer)))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)



# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 25, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

row.names(df_freetext) <- 1:nrow(df_freetext)
as.matrix(df_freetext[1:20,]) %>% kable(format = 'pipe') %>% print()

word	freq
information	344
media	184
social	181
social media	175
misinformation	160
na na	157
misleading	128
share	128
post	104
sharing	83
posts	76
always	75
misleading information	70
careful	54
dont	49
true	43
check	41
sure	39
tell	35
see	34

Woman

vector_wc <- df_why %>% filter(treatment == "combo", gender == "Woman") %>% select(reflective_5)
# Create corpus
docs <- VCorpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

BigramTokenizer <- function(x) c(unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE), 
                                 unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs,  control = list(tokenize = BigramTokenizer)))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)



# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 25, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

row.names(df_freetext) <- 1:nrow(df_freetext)
as.matrix(df_freetext[1:20,]) %>% kable(format = 'pipe') %>% print()

word	freq
information	162
social	99
media	96
misinformation	96
social media	92
na na	88
share	73
misleading	60
sharing	57
always	54
post	45
posts	44
true	41
careful	30
sure	30
misleading information	28
people	26
dont	24
avoid	23
know	23

Hypothesis Testing

Contain Course Keyword

reflective_questions <- c("What are some techniques that people use to create misleading social media posts?",
                          "When browsing your timeline in the last month, did you notice any post that looked misleading? If so, what made it seem misleading?",
                          "How did you feel when you saw the misleading post? If you haven't seen any misleading posts recently, how do you think you would feel?",
                          "Has the Inoculation against Misinformation course changed your behavior on social media? If so, how?",
                          "If you were to tell a friend what you learned in the course, what tip would you share?")

df_4 <- df_why[df_why$gender != "Other", c("treatment", "gender", "reflective_4")]
df_4$contain_course_info <- ifelse(grepl("stop|think|first|check|evaluate|identify|investigate|analyze|research|pause|question|verify|verified|identified|prove|proved|differentiate|distinguish|tell.*difference|spot|confirm|confirmed|researched|analyzed|before|tell.*between|ask myself|asked myself", df_4$reflective_4), 1L, 0L)
df_4 %>% group_by(treatment, gender) %>% summarise(percentage_mentioned = mean(contain_course_info), count_mentioned = sum(contain_course_info), total_in_group = n()) %>% filter(treatment %in% c("combo", "emotion", "tactics"))

output <- df_4[order(df_4$contain_course_info, decreasing = TRUE), c("treatment", "gender", "contain_course_info", "reflective_4")]
colnames(output) <- c("treatment_group", "gender", "contain_course_keyword", reflective_questions[4])

test_1_q4 <- t.test(output %>% filter(treatment_group %in% c("emotion") & gender == "Man") %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("emotion") & gender == "Woman") %>% select(contain_course_keyword),
                    alternative = "greater")

test_2_q4 <- t.test(output %>% filter(treatment_group %in% c("tactics"), gender == "Man") %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("tactics"), gender == "Woman") %>% select(contain_course_keyword),
                    alternative = "greater")

test_3_q4 <- t.test(output %>% filter(treatment_group %in% c("combo"), gender == "Man") %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("combo"), gender == "Woman") %>% select(contain_course_keyword),
                    alternative = "greater")



results_q4 = data.frame(matrix(NA, 3, 6))
colnames(results_q4) = c("estimates","std.err","CI_lw","CI_up","ts","p_val")
rownames(results_q4) = c("Test 1 - Man vs Woman (Emotion)",
                         "Test 2 - Man vs Woman (Reasoning)", 
                         "Test 3 - Man vs Woman (Combo)")

results_q4$estimates <- c(test_1_q4$estimate[1] - test_1_q4$estimate[2],
                          test_2_q4$estimate[1] - test_2_q4$estimate[2],
                          test_3_q4$estimate[1] - test_3_q4$estimate[2])
results_q4$std.err <- c(test_1_q4$stderr,
                        test_2_q4$stderr,
                        test_3_q4$stderr)
results_q4$CI_lw <- c(test_1_q4$conf.int[1],
                      test_2_q4$conf.int[1],
                      test_3_q4$conf.int[1])
results_q4$CI_up <- c(test_1_q4$conf.int[2],
                      test_2_q4$conf.int[2],
                      test_3_q4$conf.int[2])
results_q4$ts <- c(test_1_q4$statistic,
                   test_2_q4$statistic,
                   test_3_q4$statistic)
results_q4$p_val <- c(test_1_q4$p.value,
                      test_2_q4$p.value,
                      test_3_q4$p.value)
results_q4$p_val_holm = p.adjust(results_q4$p_val,"holm")

results_q4

df_5 <- df_why[df_why$gender != "Other", c("treatment", "gender", "reflective_5")]
df_5$contain_course_info <- ifelse(grepl("stop|think|first|check|evaluate|identify|investigate|analyze|research|pause|question|verify|verified|identified|prove|proved|differentiate|distinguish|tell.*difference|spot|confirm|confirmed|researched|analyzed|before|tell.*between|ask myself|asked myself", df_5$reflective_5), 1L, 0L)
df_5 %>% group_by(treatment, gender) %>% summarise(percentage_mentioned = mean(contain_course_info), count_mentioned = sum(contain_course_info), total_in_group = n()) %>% filter(treatment %in% c("combo", "emotion", "tactics"))

output <- df_5[order(df_5$contain_course_info, decreasing = TRUE), c("treatment", "gender", "contain_course_info", "reflective_5")]
colnames(output) <- c("treatment_group", "gender", "contain_course_keyword", reflective_questions[5])

test_1_q5 <- t.test(output %>% filter(treatment_group %in% c("emotion") & gender == "Man") %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("emotion") & gender == "Woman") %>% select(contain_course_keyword),
                    alternative = "greater")

test_2_q5 <- t.test(output %>% filter(treatment_group %in% c("tactics"), gender == "Man") %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("tactics"), gender == "Woman") %>% select(contain_course_keyword),
                    alternative = "greater")

test_3_q5 <- t.test(output %>% filter(treatment_group %in% c("combo"), gender == "Man") %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("combo"), gender == "Woman") %>% select(contain_course_keyword),
                    alternative = "greater")



results_q5 = data.frame(matrix(NA, 3, 6))
colnames(results_q5) = c("estimates","std.err","CI_lw","CI_up","ts","p_val")
rownames(results_q5) = c("Test 1 - Man vs Woman (Emotion)",
                         "Test 2 - Man vs Woman (Reasoning)", 
                         "Test 3 - Man vs Woman (Combo)")

results_q5$estimates <- c(test_1_q5$estimate[1] - test_1_q5$estimate[2],
                          test_2_q5$estimate[1] - test_2_q5$estimate[2],
                          test_3_q5$estimate[1] - test_3_q5$estimate[2])
results_q5$std.err <- c(test_1_q5$stderr,
                        test_2_q5$stderr,
                        test_3_q5$stderr)
results_q5$CI_lw <- c(test_1_q5$conf.int[1],
                      test_2_q5$conf.int[1],
                      test_3_q5$conf.int[1])
results_q5$CI_up <- c(test_1_q5$conf.int[2],
                      test_2_q5$conf.int[2],
                      test_3_q5$conf.int[2])
results_q5$ts <- c(test_1_q5$statistic,
                   test_2_q5$statistic,
                   test_3_q5$statistic)
results_q5$p_val <- c(test_1_q5$p.value,
                      test_2_q5$p.value,
                      test_3_q5$p.value)
results_q5$p_val_holm = p.adjust(results_q5$p_val,"holm")

results_q5

Contain Information Keyword

df_4 <- df_why[df_why$gender != "Other", c("treatment", "gender", "reflective_4")]
df_4$contain_misinfo_keyword <- ifelse(grepl("misinformation|misleading|disinformation|misinfo|disinfo|accurate|true", df_4$reflective_4), 1L, 0L)
df_4 %>% group_by(treatment, gender) %>% summarise(percentage_mentioned = mean(contain_misinfo_keyword), count_mentioned = sum(contain_misinfo_keyword), total_in_group = n()) %>% filter(treatment %in% c("combo", "emotion", "tactics"))

output <- df_4[order(df_4$contain_misinfo_keyword, decreasing = TRUE), c("treatment", "gender", "contain_misinfo_keyword", "reflective_4")]
colnames(output) <- c("treatment_group", "gender", "contain_course_keyword", reflective_questions[4])

test_1_q4 <- t.test(output %>% filter(treatment_group %in% c("emotion") & gender == "Man") %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("emotion") & gender == "Woman") %>% select(contain_course_keyword),
                    alternative = "less")

test_2_q4 <- t.test(output %>% filter(treatment_group %in% c("tactics"), gender == "Man") %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("tactics"), gender == "Woman") %>% select(contain_course_keyword),
                    alternative = "less")

test_3_q4 <- t.test(output %>% filter(treatment_group %in% c("combo"), gender == "Man") %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("combo"), gender == "Woman") %>% select(contain_course_keyword),
                    alternative = "less")



results_q4 = data.frame(matrix(NA, 3, 6))
colnames(results_q4) = c("estimates","std.err","CI_lw","CI_up","ts","p_val")
rownames(results_q4) = c("Test 1 - Man vs Woman (Emotion)",
                         "Test 2 - Man vs Woman (Reasoning)", 
                         "Test 3 - Man vs Woman (Combo)")

results_q4$estimates <- c(test_1_q4$estimate[1] - test_1_q4$estimate[2],
                          test_2_q4$estimate[1] - test_2_q4$estimate[2],
                          test_3_q4$estimate[1] - test_3_q4$estimate[2])
results_q4$std.err <- c(test_1_q4$stderr,
                        test_2_q4$stderr,
                        test_3_q4$stderr)
results_q4$CI_lw <- c(test_1_q4$conf.int[1],
                      test_2_q4$conf.int[1],
                      test_3_q4$conf.int[1])
results_q4$CI_up <- c(test_1_q4$conf.int[2],
                      test_2_q4$conf.int[2],
                      test_3_q4$conf.int[2])
results_q4$ts <- c(test_1_q4$statistic,
                   test_2_q4$statistic,
                   test_3_q4$statistic)
results_q4$p_val <- c(test_1_q4$p.value,
                      test_2_q4$p.value,
                      test_3_q4$p.value)
results_q4$p_val_holm = p.adjust(results_q4$p_val,"holm")

results_q4

df_5 <- df_why[df_why$gender != "Other", c("treatment", "gender", "reflective_5")]
df_5$contain_misinfo_keyword <- ifelse(grepl("misinformation|misleading|disinformation|misinfo|disinfo|accurate|true", df_5$reflective_5), 1L, 0L)
df_5 %>% group_by(treatment, gender) %>% summarise(percentage_mentioned = mean(contain_misinfo_keyword), count_mentioned = sum(contain_misinfo_keyword), total_in_group = n()) %>% filter(treatment %in% c("combo", "emotion", "tactics"))

output <- df_5[order(df_5$contain_misinfo_keyword, decreasing = TRUE), c("treatment", "gender", "contain_misinfo_keyword", "reflective_5")]
colnames(output) <- c("treatment_group", "gender", "contain_course_keyword", reflective_questions[5])

test_1_q5 <- t.test(output %>% filter(treatment_group %in% c("emotion") & gender == "Man") %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("emotion") & gender == "Woman") %>% select(contain_course_keyword),
                    alternative = "less")

test_2_q5 <- t.test(output %>% filter(treatment_group %in% c("tactics"), gender == "Man") %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("tactics"), gender == "Woman") %>% select(contain_course_keyword),
                    alternative = "less")

test_3_q5 <- t.test(output %>% filter(treatment_group %in% c("combo"), gender == "Man") %>% select(contain_course_keyword), 
                    output %>% filter(treatment_group %in% c("combo"), gender == "Woman") %>% select(contain_course_keyword),
                    alternative = "less")



results_q5 = data.frame(matrix(NA, 3, 6))
colnames(results_q5) = c("estimates","std.err","CI_lw","CI_up","ts","p_val")
rownames(results_q5) = c("Test 1 - Man vs Woman (Emotion)",
                         "Test 2 - Man vs Woman (Reasoning)", 
                         "Test 3 - Man vs Woman (Combo)")

results_q5$estimates <- c(test_1_q5$estimate[1] - test_1_q5$estimate[2],
                          test_2_q5$estimate[1] - test_2_q5$estimate[2],
                          test_3_q5$estimate[1] - test_3_q5$estimate[2])
results_q5$std.err <- c(test_1_q5$stderr,
                        test_2_q5$stderr,
                        test_3_q5$stderr)
results_q5$CI_lw <- c(test_1_q5$conf.int[1],
                      test_2_q5$conf.int[1],
                      test_3_q5$conf.int[1])
results_q5$CI_up <- c(test_1_q5$conf.int[2],
                      test_2_q5$conf.int[2],
                      test_3_q5$conf.int[2])
results_q5$ts <- c(test_1_q5$statistic,
                   test_2_q5$statistic,
                   test_3_q5$statistic)
results_q5$p_val <- c(test_1_q5$p.value,
                      test_2_q5$p.value,
                      test_3_q5$p.value)
results_q5$p_val_holm = p.adjust(results_q5$p_val,"holm")

results_q5

Free-Text Analysis

Zelin (James) Li

2023-01-30

Data

Generate Descriptive Statistics

Create variables of interest

Reflective Question 4

Heuristics

Contain Heuristics

Does not contain heuristics

Conduct Hypothesis Test

Plot

WordCloud

Reflective Question 5

Heuristics

Contain Heuristics

Does not contain heuristics

Conduct Hypothesis Test

Plot

WordCloud

Comparing Gender

Reflective Question 4

Emotions

Reasoning

Combo

Reflective Question 5

Emotions

Reasoning

Combo

Hypothesis Testing

Contain Course Keyword

Contain Information Keyword