Free-Text Analysis

rm(list = ls())
library(openxlsx)
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, zoo, data.table)
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, topicmodels, ldatuning, lda,SnowballC, pals, flextable, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr, politeness, textir)

# Fix the random-number state so any later stochastic step is repeatable.
set.seed(94305)

# Make sure each output folder exists; warnings are silenced so an
# already-existing folder is not reported.
for (out_dir in c("tables", "figures", "freetext")) {
  dir.create(out_dir, showWarnings = FALSE)
}

Data

library(dplyr)
# ---- Locate and load the survey exports ------------------------------------

# Return the most recently modified path in `paths`.
#   paths    character vector of candidate file paths
#   label    text used in the error message to say which export is missing
#   required if TRUE, stop with a readable error when `paths` is empty;
#            if FALSE, return character(0) (the value that indexing with
#            which.max() on an empty vector would have produced)
pick_latest_file <- function(paths, label, required = TRUE) {
  if (length(paths) == 0L) {
    if (required) {
      stop(
        "No '", label, "' export found in ../main_analysis/data",
        call. = FALSE
      )
    }
    return(character(0))
  }
  paths[which.max(file.info(paths)$mtime)]
}

files <- list.files("../main_analysis/data", full.names = TRUE)
pre_files <- files[grepl("Main PRE", files)]
post_files <- files[grepl("Main POST", files)]
followup_files <- files[grepl("Followup", files)]

# The PRE export is only consumed by the commented-out read below, so a
# missing PRE file is tolerated; POST and follow-up are read right away and
# therefore must exist.
INPUT_FILENAME_PRE <- pick_latest_file(pre_files, "Main PRE", required = FALSE)
INPUT_FILENAME_POST <- pick_latest_file(post_files, "Main POST")
INPUT_FILENAME_FOLLOWUP <- pick_latest_file(followup_files, "Followup")
# df_full_pre <- read_csv(INPUT_FILENAME_PRE) %>% clean_names %>% remove_empty()
df_full_post <- read_csv(INPUT_FILENAME_POST) %>%
  clean_names() %>%
  remove_empty()
df_full_followup <- read_csv(INPUT_FILENAME_FOLLOWUP) %>%
  clean_names() %>%
  remove_empty()

df_final <- readRDS("../main_analysis/data/df_final.rds")

# Keep follow-up rows that are finished, consented and have status
# "IP Address".
df <- df_full_followup %>%
  filter(
    finished == "True",
    consent_course == "Yes, I agree",
    status == "IP Address"
  )

# Attach the contact-list columns, keyed on the external reference with a
# leading "+" added so it matches the survey's `external_reference` format.
# NOTE(review): if ExternalDataReference is read as a number, as.character()
# may not round-trip very long values -- confirm against the CSV.
# NOTE(review): duplicate references in the contact list would duplicate
# survey rows in this join -- confirm the list is unique per reference.
contact_list <- read.csv("../contact_lists/main/misinfo_followup.csv") %>%
  rename(external_reference = ExternalDataReference)
contact_list$external_reference <- paste0(
  "+", as.character(contact_list$external_reference)
)
df <- left_join(df, contact_list, by = "external_reference")

# Restrict the POST export to phone numbers present in the final sample.
df_post_valid_phone <- unique(df_final$phone_num)
df_post <- df_full_post[df_full_post$phone_num %in% df_post_valid_phone, ]

# clean_phone_number <- function(phone){
#   if (is.na(phone)){
#     return ("")
#   }
#   if (((substr(phone, 1, 1) == "0") | (substr(phone, 1, 1) == "O") | (substr(phone, 1, 1) == "+")) & nchar(phone) == 10){
#     phone <- substr(phone, 2, nchar(phone))
#   } else if (((substr(phone, 1, 3) == "254")) & nchar(phone) == 12){
#     phone <- substr(phone, 4, nchar(phone))
#   } else if (((substr(phone, 1, 4) == "2540")) & nchar(phone) == 13){
#     phone <- substr(phone, 5, nchar(phone))
#   } else if (((substr(phone, 1, 4) == "+254")) & nchar(phone) == 13){
#     phone <- substr(phone, 5, nchar(phone))
#   } else if (((substr(phone, 1, 4) == "±254")) & nchar(phone) == 13){
#     phone <- substr(phone, 5, nchar(phone))
#   } else if (((substr(phone, 1, 4) == "†254")) & nchar(phone) == 13){
#     phone <- substr(phone, 5, nchar(phone))
#   } else if (((substr(phone, 1, 5) == "+2540")) & nchar(phone) == 14){
#     phone <- substr(phone, 6, nchar(phone))
#   } else {
#     phone <- ""
#   }
#   return (phone)
#   
#   
# }
# df_post_phone <- sapply(df_post_valid_phone, function(x) clean_phone_number(x))

# Columns of `df` whose names begin with "reflective".
free_text_columns <- grep("^reflective", colnames(df), value = TRUE)

# Question wording, in order; element i is used as the column header when the
# answers to reflective question i are exported.
reflective_questions <- c(
  "What are some techniques that people use to create misleading social media posts?",
  "When browsing your timeline in the last month, did you notice any post that looked misleading? If so, what made it seem misleading?",
  "How did you feel when you saw the misleading post? If you haven't seen any misleading posts recently, how do you think you would feel?",
  "Has the Inoculation against Misinformation course changed your behavior on social media? If so, how?",
  "If you were to tell a friend what you learned in the course, what tip would you share?"
)


# for (i in 1:5){
#   output <- df[free_text_columns[i]]
#   colnames(output) <- reflective_questions[i]
#   write.csv(output, paste0("./freetext/reflective_questions_", i, ".csv"))
# }

Reflective Question 4

Has the Inoculation against Misinformation course changed your behavior on social media? If so, how?

Heuristics

# Flag answers to reflective question 4 that contain any course keyword,
# summarise the flag by treatment arm, and export the flagged answers.
course_keyword_pattern <- "stop|think|first|check|evaluate|identify|investigate|analyze|research|pause|question"

df_4 <- df[, c("treatment", "reflective_4")]

# Match case-insensitively: answers are free text, so "Stop" or "THINK" must
# count as well. grepl() returns FALSE for NA, so unanswered rows are coded 0.
df_4$contain_course_info <- as.integer(
  grepl(course_keyword_pattern, df_4$reflective_4, ignore.case = TRUE)
)

# Share and count of keyword-containing answers per treatment group.
df_4 %>%
  group_by(treatment) %>%
  summarise(
    percentage_mentioned = mean(contain_course_info),
    count_mentioned = sum(contain_course_info),
    total_in_group = n()
  )

# Flagged answers first, then export with the question text as the header of
# the answer column.
output <- df_4[
  order(df_4$contain_course_info, decreasing = TRUE),
  c("treatment", "contain_course_info", "reflective_4")
]
colnames(output) <- c(
  "treatment_group", "contain_course_keyword", reflective_questions[4]
)
write.csv(output, "./freetext/reflective_questions_4_heuristics.csv")

Contains heuristics

# Interactive table of the answers (third column) flagged with a keyword.
has_keyword <- output$contain_course_keyword == 1
datatable(output[has_keyword, 3])

Does not contain heuristics

# Interactive table of the answers (third column) without any keyword.
lacks_keyword <- output$contain_course_keyword == 0
datatable(output[lacks_keyword, 3])

WordCloud

# Word cloud of the answers to reflective question 4.
vector_wc <- df_4$reflective_4

# Build the corpus from answered rows only, so a missing answer cannot end up
# in the cloud as a token.
docs <- Corpus(VectorSource(vector_wc[!is.na(vector_wc)]))

# Clean the corpus. Stop words are removed while apostrophes are still
# present: the English stop-word list contains contractions such as "don't",
# which no longer match once punctuation has been stripped.
docs <-
  docs %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)

# Term frequencies across all answers, most frequent first.
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Draw the cloud: most frequent words placed first, at most 200 words.
wordcloud(
  words = df_freetext$word,
  freq = df_freetext$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

# dataset_names <- list('Contain' = output[output$contain_course_keyword == 1, ], 
#                       'Does not contain' = output[output$contain_course_keyword == 0, ])
# openxlsx::write.xlsx(dataset_names,  paste0("./freetext/reflective_questions_4_heuristics.xlsx"))

Reflective Question 5

If you were to tell a friend what you learned in the course, what tip would you share?

Heuristics

# Flag answers to reflective question 5 that contain any course keyword and
# summarise the flag by treatment arm.
course_keyword_pattern <- "stop|think|first|check|evaluate|identify|investigate|analyze|research|pause|question"

df_5 <- df[, c("treatment", "reflective_5")]

# Match case-insensitively: answers are free text, so "Stop" or "THINK" must
# count as well. grepl() returns FALSE for NA, so unanswered rows are coded 0.
df_5$contain_course_info <- as.integer(
  grepl(course_keyword_pattern, df_5$reflective_5, ignore.case = TRUE)
)

# Share and count of keyword-containing answers per treatment group.
df_5 %>%
  group_by(treatment) %>%
  summarise(
    percentage_mentioned = mean(contain_course_info),
    count_mentioned = sum(contain_course_info),
    total_in_group = n()
  )

# Flagged answers first; the answer column is headed by the question text.
output <- df_5[
  order(df_5$contain_course_info, decreasing = TRUE),
  c("treatment", "contain_course_info", "reflective_5")
]
colnames(output) <- c(
  "treatment_group", "contain_course_keyword", reflective_questions[5]
)
# write.csv(output, paste0("./freetext/reflective_questions_5_heuristics.csv"))

# library(openxlsx)
# dataset_names <- list('Contain' = output[output$contain_course_keyword == 1, ], 
#                       'Does not contain' = output[output$contain_course_keyword == 0, ])
# openxlsx::write.xlsx(dataset_names,  paste0("./freetext/reflective_questions_5_heuristics.xlsx"))

Contains heuristics

# Interactive table of the answers (third column) flagged with a keyword.
has_keyword <- output$contain_course_keyword == 1
datatable(output[has_keyword, 3])

Does not contain heuristics

# Interactive table of the answers (third column) without any keyword.
lacks_keyword <- output$contain_course_keyword == 0
datatable(output[lacks_keyword, 3])

WordCloud

# Word cloud of the answers to reflective question 5.
vector_wc <- df_5$reflective_5

# Build the corpus from answered rows only, so a missing answer cannot end up
# in the cloud as a token.
docs <- Corpus(VectorSource(vector_wc[!is.na(vector_wc)]))

# Clean the corpus. Stop words are removed while apostrophes are still
# present: the English stop-word list contains contractions such as "don't",
# which no longer match once punctuation has been stripped.
docs <-
  docs %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)

# Term frequencies across all answers, most frequent first.
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Draw the cloud: most frequent words placed first, at most 200 words.
wordcloud(
  words = df_freetext$word,
  freq = df_freetext$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

Free-Text Analysis

Zelin (James) Li

2022-10-11

Data

Reflective Question 4

Heuristics

Contains heuristics

Does not contain heuristics

WordCloud

Reflective Question 5

Heuristics

Contains heuristics

Does not contain heuristics

WordCloud