# Setup ----
# Clear the global environment so objects left over from an earlier session
# are not picked up by this script.
rm(list = ls())

# Packages ----
library(openxlsx)
# A single p_load() call replaces two overlapping ones. Packages are listed in
# the order in which they were first attached, so function masking is the same
# as before.
pacman::p_load(
  DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid,
  ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer,
  hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, zoo, data.table,
  topicmodels, ldatuning, lda, SnowballC, pals, flextable, nnet, texreg,
  compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr,
  politeness, textir
)
library(dplyr)

# Fix the random-number seed for reproducibility.
set.seed(94305)

# Output directories ----
# showWarnings = FALSE keeps re-runs quiet when the folders already exist.
dir.create("tables", showWarnings = FALSE)
dir.create("figures", showWarnings = FALSE)
dir.create("freetext", showWarnings = FALSE)
# Locate input files ----
# All survey exports sit in one folder; the survey wave is identified by a
# fragment of the file name.
files <- list.files("../main_analysis/data", full.names = TRUE)
pre_files <- grep("Main PRE", files, value = TRUE)
post_files <- grep("Main POST", files, value = TRUE)
followup_files <- grep("Followup", files, value = TRUE)

# For each wave, pick the export with the latest modification time.
INPUT_FILENAME_PRE <- pre_files[which.max(file.mtime(pre_files))]
INPUT_FILENAME_POST <- post_files[which.max(file.mtime(post_files))]
INPUT_FILENAME_FOLLOWUP <- followup_files[which.max(file.mtime(followup_files))]

# Read raw exports ----
# Column names are normalised with clean_names() and empty rows/columns are
# dropped with remove_empty().
# df_full_pre <- read_csv(INPUT_FILENAME_PRE) %>% clean_names %>% remove_empty()
df_full_post <- read_csv(INPUT_FILENAME_POST) %>%
  clean_names %>%
  remove_empty()
df_full_followup <- read_csv(INPUT_FILENAME_FOLLOWUP) %>%
  clean_names %>%
  remove_empty()
df_final <- readRDS("../main_analysis/data/df_final.rds")

# Follow-up analysis sample ----
# Keep rows that are finished, consented, and have status "IP Address".
df <- df_full_followup %>%
  filter(
    finished == "True",
    consent_course == "Yes, I agree",
    status == "IP Address"
  )

# Attach contact-list information. The contact list stores the reference
# without the leading "+", so it is prepended before joining.
contact_list <- read.csv("../contact_lists/main/misinfo_followup.csv") %>%
  rename(external_reference = ExternalDataReference) %>%
  mutate(external_reference = paste0("+", as.character(external_reference)))
df <- left_join(df, contact_list, by = "external_reference")

# Post-survey rows whose phone number appears in df_final.
df_post_valid_phone <- unique(df_final$phone_num)
df_post <- df_full_post[df_full_post$phone_num %in% df_post_valid_phone, ]
# clean_phone_number <- function(phone){
# if (is.na(phone)){
# return ("")
# }
# if (((substr(phone, 1, 1) == "0") | (substr(phone, 1, 1) == "O") | (substr(phone, 1, 1) == "+")) & nchar(phone) == 10){
# phone <- substr(phone, 2, nchar(phone))
# } else if (((substr(phone, 1, 3) == "254")) & nchar(phone) == 12){
# phone <- substr(phone, 4, nchar(phone))
# } else if (((substr(phone, 1, 4) == "2540")) & nchar(phone) == 13){
# phone <- substr(phone, 5, nchar(phone))
# } else if (((substr(phone, 1, 4) == "+254")) & nchar(phone) == 13){
# phone <- substr(phone, 5, nchar(phone))
# } else if (((substr(phone, 1, 4) == "±254")) & nchar(phone) == 13){
# phone <- substr(phone, 5, nchar(phone))
# } else if (((substr(phone, 1, 4) == "†254")) & nchar(phone) == 13){
# phone <- substr(phone, 5, nchar(phone))
# } else if (((substr(phone, 1, 5) == "+2540")) & nchar(phone) == 14){
# phone <- substr(phone, 6, nchar(phone))
# } else {
# phone <- ""
# }
# return (phone)
#
#
# }
# df_post_phone <- sapply(df_post_valid_phone, function(x) clean_phone_number(x))

# Free-text questions ----
# Names of the free-text response columns: every column of `df` whose name
# starts with "reflective". This assignment used to be fused onto the end of
# the comment line above and therefore never ran.
free_text_columns <- colnames(df)[startsWith(colnames(df), "reflective")]

# Wording of the five reflective questions, in questionnaire order; used as
# column headers in the exported tables.
reflective_questions <- c(
  "What are some techniques that people use to create misleading social media posts?",
  "When browsing your timeline in the last month, did you notice any post that looked misleading? If so, what made it seem misleading?",
  "How did you feel when you saw the misleading post? If you haven't seen any misleading posts recently, how do you think you would feel?",
  "Has the Inoculation against Misinformation course changed your behavior on social media? If so, how?",
  "If you were to tell a friend what you learned in the course, what tip would you share?"
)
# for (i in 1:5){
# output <- df[free_text_columns[i]]
# colnames(output) <- reflective_questions[i]
# write.csv(output, paste0("./freetext/reflective_questions_", i, ".csv"))
# }

# Question 4: Has the Inoculation against Misinformation course changed your behavior on social media? If so, how?
# Current heuristic (a response counts if it contains any of the following keywords): stop|think|first|check|evaluate|identify|investigate|analyze|research|pause|question
# Reflective question 4: keyword heuristic ----
# A response is flagged when it contains at least one course keyword.
# NOTE(review): grepl() is case-sensitive by default and matches substrings,
# so a response starting with "Stop" or "Check" is not flagged. Confirm
# whether ignore.case = TRUE is wanted before relying on these shares.
course_keyword_pattern <- "stop|think|first|check|evaluate|identify|investigate|analyze|research|pause|question"

df_4 <- df[, c("treatment", "reflective_4")]
df_4$contain_course_info <- ifelse(
  grepl(course_keyword_pattern, df_4$reflective_4),
  1L,
  0L
)

# Share and count of flagged responses per treatment arm.
df_4 %>%
  group_by(treatment) %>%
  summarise(
    percentage_mentioned = mean(contain_course_info),
    count_mentioned = sum(contain_course_info),
    total_in_group = n()
  )

# Export every response, flagged ones first, with readable column headers.
output <- df_4[
  order(df_4$contain_course_info, decreasing = TRUE),
  c("treatment", "contain_course_info", "reflective_4")
]
colnames(output) <- c("treatment_group", "contain_course_keyword", reflective_questions[4])
write.csv(output, "./freetext/reflective_questions_4_heuristics.csv")

# Interactive tables of the response text (third column): flagged, then
# unflagged.
datatable(output[output$contain_course_keyword == 1, 3])
datatable(output[output$contain_course_keyword == 0, 3])

# Reflective question 4: word cloud ----
vector_wc <- df_4$reflective_4
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(
  words = df_freetext$word,
  freq = df_freetext$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
# dataset_names <- list('Contain' = output[output$contain_course_keyword == 1, ],
# 'Does not contain' = output[output$contain_course_keyword == 0, ])
# openxlsx::write.xlsx(dataset_names, paste0("./freetext/reflective_questions_4_heuristics.xlsx"))

# Question 5: If you were to tell a friend what you learned in the course, what tip would you share?
# Current heuristic (a response counts if it contains any of the following keywords): stop|think|first|check|evaluate|identify|investigate|analyze|research|pause|question
# Reflective question 5: keyword heuristic ----
# A response is flagged when it contains at least one course keyword.
# NOTE(review): grepl() is case-sensitive by default and matches substrings,
# so a response starting with "Stop" or "Check" is not flagged. Confirm
# whether ignore.case = TRUE is wanted before relying on these shares.
course_keyword_pattern <- "stop|think|first|check|evaluate|identify|investigate|analyze|research|pause|question"

df_5 <- df[, c("treatment", "reflective_5")]
df_5$contain_course_info <- ifelse(
  grepl(course_keyword_pattern, df_5$reflective_5),
  1L,
  0L
)

# Share and count of flagged responses per treatment arm.
df_5 %>%
  group_by(treatment) %>%
  summarise(
    percentage_mentioned = mean(contain_course_info),
    count_mentioned = sum(contain_course_info),
    total_in_group = n()
  )

# Every response, flagged ones first, with readable column headers.
output <- df_5[
  order(df_5$contain_course_info, decreasing = TRUE),
  c("treatment", "contain_course_info", "reflective_5")
]
colnames(output) <- c("treatment_group", "contain_course_keyword", reflective_questions[5])
# write.csv(output, paste0("./freetext/reflective_questions_5_heuristics.csv"))
# library(openxlsx)
# dataset_names <- list('Contain' = output[output$contain_course_keyword == 1, ],
# 'Does not contain' = output[output$contain_course_keyword == 0, ])
# openxlsx::write.xlsx(dataset_names, paste0("./freetext/reflective_questions_5_heuristics.xlsx"))

# Interactive tables of the response text (third column): flagged, then
# unflagged. These calls were previously fused onto the comment line above
# and never ran.
datatable(output[output$contain_course_keyword == 1, 3])
datatable(output[output$contain_course_keyword == 0, 3])

# Reflective question 5: word cloud ----
# This assignment was also swallowed by the comment line, so the word cloud
# below was drawn from whatever `vector_wc` already held instead of the
# question 5 responses.
vector_wc <- df_5$reflective_5
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(
  words = df_freetext$word,
  freq = df_freetext$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)