library(tidyverse)
library(kableExtra)
library(broom)
library(texreg)
library(patchwork)
library(readr)
library(lubridate)
library(boot)
library(stargazer)
library(fixest)
library(pander)
library(stringi)FB-Misinformation Data Cleaning
The purpose of this document is to clean the raw FB Misinformation Chatfuel data.
Loading R library
create_table <- function(data) {
# Reshape data: pivot from long to wide format
reshaped_data <- data %>%
pivot_wider(names_from = arm, values_from = phase_coded)
# Generate and return the table
kable(reshaped_data)
}Loading the raw dataset and generating a seperate anon_id for each respondents
The data was pulled on 01/18/2024. The data is downloaded from Chatfuel without personal private information of respondents such as name, profile picture url, and etc. Chatfuel user id is collected, but for safety, we’ll generate a unique id (anon_id) for each of the users.
setwd("C:/Users/kietle/OneDrive - Stanford/Documents/")
data <- read_csv("Project_Don_t_Get_Duped_2024_01_18_18_27_44.csv")
df_chat_anon <- data %>%
mutate(anon_id = 1:nrow(.)) %>%
select(-`chatfuel user id`)
write_csv(df_chat_anon, "Project_Don_t_Get_Duped_2024_01_18_18_27_44_anon.csv")setwd("C:/Users/kietle/OneDrive - Stanford/Documents/")
data <- read_csv("Project_Don_t_Get_Duped_2024_01_18_18_27_44_anon.csv")Creating a copy of raw dataset
data_raw <- data %>%
write_csv("misinfo_raw.csv")Cleaning phases (phase_coded)
Looking at respondents across different phases and generating generating a phase identifier (“phase_coded”)
Categorizing Phases
Respondents who were in Version ExperimentLaunchV1 or ExperimentLaunchV2 as Phase 1
Respondents who were in Version Phase2, Phase2_V2 or Phase2_V3 as Phase 2
total_count <- data %>%
count(NewVersion) %>%
summarise(total = sum(n)) %>%
pull(total)
data %>%
count(NewVersion) %>%
rbind(data.frame(NewVersion = "Total", n = total_count)) %>%
kable("html", col.names=(c("Phase", "Count"))) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") | Phase | Count |
|---|---|
| ExperimentLaunchV1 | 9914 |
| ExperimentLaunchV2 | 30519 |
| Final_Technical_Pilot | 181 |
| Misinfo_Pilot | 290 |
| Phase2 | 8058 |
| Phase2_V2 | 27276 |
| Phase2_V3 | 15512 |
| NA | 66568 |
| Total | 158318 |
# generating coded phase 1 and 2 variables and dropping respondents not in Phase 1 or Phase 2
data$phase_coded <-data$NewVersion
data$phase_coded[data$phase_coded=="ExperimentLaunchV1" | data$phase_coded=="ExperimentLaunchV2"] = "Phase 1"
data$phase_coded[data$phase_coded=="Phase2" | data$phase_coded=="Phase2_V2" | data$phase_coded=="Phase2_V3"] = "Phase 2"
total_count <- data %>%
count(phase_coded) %>%
summarise(total = sum(n)) %>%
pull(total)
data %>%
count(phase_coded) %>%
rbind(data.frame(phase_coded = "Total", n = total_count)) %>%
kable("html", col.names=(c("Phase", "Count"))) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") | Phase | Count |
|---|---|
| Final_Technical_Pilot | 181 |
| Misinfo_Pilot | 290 |
| Phase 1 | 40433 |
| Phase 2 | 50846 |
| NA | 66568 |
| Total | 158318 |
Dropping respondents not in Phase 1 or Phase 2
data <- data %>%
filter(phase_coded == "Phase 1" | phase_coded == "Phase 2")total_count <- data %>%
count(phase_coded) %>%
summarise(total = sum(n)) %>%
pull(total)
data %>%
count(phase_coded) %>%
rbind(data.frame(phase_coded = "Total", n = total_count)) %>%
kable("html", col.names=(c("Phase", "Count"))) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") | Phase | Count |
|---|---|
| Phase 1 | 40433 |
| Phase 2 | 50846 |
| Total | 91279 |
Identifying attempted and actual repeaters
Repeaters are classified as “attempted repeaters” or “actual repeaters”.
Attempted repeaters are respondents trying to repeat the chatbot but were turned away automatically. The chatbot variable “repeater” identify if a person attempted repeat and based on their personal and chatfuel identification, we can determine if that person had completed the chatbot before.
Actual repeaters are respondents who actually repeated the chatbot.These actual repeaters are identified by having a timestamp in at least two treatment arms.
data_repeaters <- data %>%
select(anon_id, phase_coded,
repeater,
arm,
InterArms_start_time, InterArmsVideo_end_time, # video start and end time
InterArmsLongBase_start_time, InterArmsLongBase_end_time, #long baseline start,end time
InterArmsBase_start_time, InterArmsBase_end_time, #short baseline start,end time
InterArmsSMS_start_time, InterArmsSMS_end_time, #sms start,end time
InterArmsGame_start_time, interArmsGame_end_time, # game start and end time
MisinfoQuiz_start_time) %>%
mutate(has_longbase_time = if_else(!is.na(InterArmsLongBase_start_time) | !is.na(InterArmsLongBase_end_time), TRUE, FALSE),
has_video_time = if_else(!is.na(InterArms_start_time) | !is.na(InterArmsVideo_end_time), TRUE, FALSE),
has_game_time = if_else(!is.na(InterArmsGame_start_time) | !is.na(interArmsGame_end_time), TRUE, FALSE),
has_shortbase_time = if_else(!is.na(InterArmsBase_start_time) | !is.na(InterArmsBase_end_time), TRUE, FALSE),
has_sms_time = if_else(!is.na(InterArmsSMS_start_time) | !is.na(InterArmsSMS_end_time), TRUE, FALSE),
repeated = has_longbase_time + has_shortbase_time + has_sms_time + has_video_time +has_game_time > 1) %>%
select(anon_id, arm, repeater, repeated, phase_coded, MisinfoQuiz_start_time)
data_repeaters_by_date_P1 <- data_repeaters %>%
filter(phase_coded=="Phase 1") %>%
mutate(day = as.Date(MisinfoQuiz_start_time)) %>%
group_by(day) %>%
count(repeated) %>%
mutate(pct = n/sum(n))
data_repeaters_by_date_P2 <- data_repeaters %>%
filter(phase_coded=="Phase 2") %>%
mutate(day = as.Date(MisinfoQuiz_start_time)) %>%
group_by(day) %>%
count(repeated) %>%
mutate(pct = n/sum(n))summarized_data <- data_repeaters %>%
group_by(phase_coded) %>%
summarize(
attempted_repeaters = sum(repeater, na.rm = TRUE),
actual_repeaters = sum(repeated)
) %>%
ungroup()
# Calculate total row
total_row <- summarized_data %>%
summarise(
phase_coded = "Total",
attempted_repeaters = sum(attempted_repeaters, na.rm = TRUE),
actual_repeaters = sum(actual_repeaters, na.rm = TRUE)
)
# Combine the total row with the summarized data
final_data <- bind_rows(summarized_data, total_row)
# Create the table
final_data %>%
mutate(total=attempted_repeaters+actual_repeaters) %>%
kable("html", col.names = c("Phase", "Attempted Repeaters", "Actual Repeaters", "Total")) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| Phase | Attempted Repeaters | Actual Repeaters | Total |
|---|---|---|---|
| Phase 1 | 11612 | 3425 | 15037 |
| Phase 2 | 9955 | 23 | 9978 |
| Total | 21567 | 3448 | 25015 |
Repeaters entered chatbot in Phase 1
ggplot(data_repeaters_by_date_P1 %>% filter(repeated),
aes(x = day, y = pct)) +
geom_bar(position = "identity", stat = "identity") +
scale_y_continuous(limits = c(0, 1), labels = scales::percent, expand = c(0, 0, 0, 0)) +
scale_x_date(breaks = "day") +
labs(x = "Day",
y = "% respondents who repeatedly entered chatbot in Phase 1") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90))Repeaters entered chatbot in Phase 2
ggplot(data_repeaters_by_date_P2 %>% filter(repeated),
aes(x = day, y = pct)) +
geom_bar(position = "identity", stat = "identity") +
scale_y_continuous(limits = c(0, 1), labels = scales::percent, expand = c(0, 0, 0, 0)) +
scale_x_date(breaks = "day") +
labs(x = "Day",
y = "% respondents who repeatedly entered chatbot in Phase 2") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90))Excluding repeaters
data <- left_join(data, data_repeaters %>% select(anon_id, repeated), by = "anon_id") %>%
filter(!repeated)summarized_data <- data_repeaters %>%
group_by(phase_coded) %>%
summarize(
attempted_repeaters = sum(repeater, na.rm = TRUE),
actual_repeaters = sum(repeated)
) %>%
ungroup()
# Calculate total row
total_row <- summarized_data %>%
summarise(
phase_coded = "Total",
attempted_repeaters = sum(attempted_repeaters, na.rm = TRUE),
actual_repeaters = sum(actual_repeaters, na.rm = TRUE)
)
# Combine the total row with the summarized data
final_data <- bind_rows(summarized_data, total_row)
# Create the table
final_data %>%
mutate(total=attempted_repeaters+actual_repeaters) %>%
kable("html", col.names = c("Phase", "Attempted Repeaters", "Actual Repeaters", "Total")) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| Phase | Attempted Repeaters | Actual Repeaters | Total |
|---|---|---|---|
| Phase 1 | 11612 | 3425 | 15037 |
| Phase 2 | 9955 | 23 | 9978 |
| Total | 21567 | 3448 | 25015 |
Cleaning treatment arms variable (arm_coded)
Checking the raw treatment arm variable (“arm”), we see the following possible entries: baseline, game, longbaseline, sms, video
There is one entry of “video” in Phase 2, so we’ll treat it as a missing value
Baseline 1 : If arm == “baseline” and phase_coded == “Phase 1”
SMS: If arm == “sms”
Video: If arm == “video” and phase_coded == “Phase 1”
Game : If arm == “game” and phase_coded == “Phase 1”
Short Baseline 2 : If arm == “baseline” and phase_coded == “Phase 2”
Long Baseline: If arm == “longbaseline” and phase_coded == “Phase 2”
#checking possible treatment variables
data %>%
count(arm) %>%
kable("html", col.names=(c("Treatment Arm", "Count"))) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") | Treatment Arm | Count |
|---|---|
| baseline | 13715 |
| game | 7491 |
| longbaseline | 14160 |
| sms | 27849 |
| video | 7546 |
| NA | 17070 |
Recoding treatment arm variables
data <- data %>%
mutate(
arm_coded = case_when(
arm == "baseline" & phase_coded == "Phase 1" ~ "Baseline 1",
arm %in% c("SMS", "sms") ~ "SMS",
arm == "video" & phase_coded == "Phase 1" ~ "Video",
arm == "game" & phase_coded == "Phase 1" ~ "Game",
arm == "baseline" & phase_coded == "Phase 2" ~ "Short Baseline 2",
arm == "longbaseline" & phase_coded == "Phase 2" ~ "Long Baseline",
TRUE ~ NA_character_
)
)
data %>%
group_by(phase_coded) %>%
summarise(baseline_1=sum(arm_coded=="Baseline 1", na.rm=TRUE),
sms= sum(arm_coded=="SMS", na.rm=TRUE),
video= sum(arm_coded=="Video", na.rm=TRUE),
game= sum(arm_coded=="Game", na.rm=TRUE),
short_baseline_2=sum(arm_coded=="Short Baseline 2", na.rm=TRUE),
long_baseline = sum(arm_coded=="Long Baseline", na.rm=TRUE)) %>%
mutate(total=baseline_1+sms+video+game+short_baseline_2+long_baseline) %>%
kable("html", col.names=(c("Phase", "Baseline 1", "SMS",
"Video", "Game", "Short Baseline 2",
"Long Baseline", "Total"))) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%")| Phase | Baseline 1 | SMS | Video | Game | Short Baseline 2 | Long Baseline | Total |
|---|---|---|---|---|---|---|---|
| Phase 1 | 7463 | 7564 | 7545 | 7491 | 0 | 0 | 30063 |
| Phase 2 | 0 | 20285 | 0 | 0 | 6252 | 14158 | 40695 |
Cleaning user country (country_coded)
Variable user_country indicates where the respondents come from
Responses are messy with some random links and emojis which are treated as missing values
Answers who include an explicit name of the country or small typos are renamed as one of the four countries
Checking possible country answers
data$country_coded <- data$user_country
data %>%
count(country_coded)# A tibble: 221 × 2
country_coded n
<chr> <int>
1 ",nigeria" 1
2 ".\nSouth Africa" 1
3 ". south Africa" 1
4 ".." 1
5 "Accra" 1
6 "Alright" 1
7 "Am also called BLESS" 1
8 "Am from Dubai" 1
9 "Am from Ukraine" 1
10 "Am from benue state" 1
# ℹ 211 more rows
Cleaning and renaming country responses
data$country_coded[data$country_coded == "kenya" |data$country_coded == "Iam from Kenya" | data$country_coded == "KENYA" | data$country_coded == "Kenya." | data$country_coded == "kENYA" | data$country_coded == "Iam from Kenya"] = "Kenya"
data$country_coded[data$country_coded == "South africa" | data$country_coded == "south africa" | data$country_coded == "SOUTH AFRICA" | data$country_coded == "south Africa" | data$country_coded == ".\nSouth Africa" | data$country_coded == ". south Africa" | data$country_coded == "SOUTH: AFRICA:" | data$country_coded == "South africa" | data$country_coded == "South Africa." | data$country_coded == "South Africa..." | data$country_coded == "SOUTH Africa" ] = "South Africa"
data$country_coded[data$country_coded == "nigeria" | data$country_coded == "NIGERIA" | data$country_coded== ",nigeria" | data$country_coded== "Am in Nigeria" | data$country_coded== "I'm from Nigeria" |data$country_coded == "Hi Hi\nNigeria"| data$country_coded == "Nigeria." | data$country_coded == "Nigeria.." | data$country_coded == "Nigeria 🇳🇬" | data$country_coded == "Plateau State, Nigeria"| data$country_coded == "Plateau State, Nigeria" | data$country_coded == "nigerian"] = "Nigeria"
data$country_coded[data$country_coded == "GHANA" | data$country_coded == "ghana" | data$country_coded == "Am in Ghana" | data$country_coded == "GHANA." | data$country_coded == "GHAna" | data$country_coded == "Gahna" | data$country_coded == "Ghana." | data$country_coded == "In Ghana" | data$country_coded == "In Ghana" | data$country_coded == "From Ghana please" ] = "Ghana"
data$country_coded[!data$country_coded %in% c("Kenya", "South Africa", "Nigeria", "Ghana")] <- NAsummarized_data <- data %>%
group_by(phase_coded) %>%
summarise(
ghana = sum(country_coded == "Ghana", na.rm = TRUE),
kenya = sum(country_coded == "Kenya", na.rm = TRUE),
sa = sum(country_coded == "South Africa", na.rm = TRUE),
nigeria = sum(country_coded == "Nigeria", na.rm = TRUE)
) %>%
ungroup()
# Calculate total row
total_row <- summarized_data %>%
summarise(
phase_coded = "Total",
ghana = sum(ghana, na.rm = TRUE),
kenya = sum(kenya, na.rm = TRUE),
sa = sum(sa, na.rm = TRUE),
nigeria = sum(nigeria, na.rm = TRUE)
)
# Combine the total row with the summarized data
final_data <- bind_rows(summarized_data, total_row)
# Add total column and create the table
final_data %>%
mutate(total = ghana + kenya + sa + nigeria) %>%
kable("html", col.names = c("Phase", "Ghana", "Kenya", "South Africa", "Nigeria", "Total")) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(width = "100%")| Phase | Ghana | Kenya | South Africa | Nigeria | Total |
|---|---|---|---|---|---|
| Phase 1 | 4415 | 4286 | 9707 | 14191 | 32599 |
| Phase 2 | 4674 | 6122 | 14106 | 20470 | 45372 |
| Total | 9089 | 10408 | 23813 | 34661 | 77971 |
Cleaning consent responses (consent_coded)
The chatbot quick responses are “I consent, start now” for consenting and “No, let’s stop here” for not providing consent
Beside the preset selection, We received random responses such emojis, questions, links and other incomplete sentences
If responses beside “I consent, start now” include an explicit agreement, then those responses are considered “I consent start now”. There are some responses only contain the age information without an explicit agreement and these responses are not considered “I consent, start now”.
Those who did not answer consent or did not answer are considered “No, let’s stop here”
Checking possible consent responses
data$consent_coded <- data$consent
data %>% count(consent_coded)# A tibble: 147 × 2
consent_coded n
<chr> <int>
1 ". When?" 1
2 "0533521502" 1
3 "0745432220" 1
4 "09130184439" 1
5 "26 old" 1
6 "37 year old" 1
7 "6es" 1
8 "Abi \U0001f644\U0001f644" 1
9 "Alright" 2
10 "Am 22" 1
# ℹ 137 more rows
Cleaning and renaming consent responses
data$consent_coded[data$consent_coded=="I Consent, start now" | data$consent_coded=="I consent start now" | data$consent_coded=="I consent start now " | data$consent_coded=="I consent,\nStart now" | data$consent_coded=="Sure!" | data$consent_coded=="Sure, let's go!" | data$consent_coded=="Yes" | data$consent_coded=="Yes I im" | data$consent_coded=="Yes I'm 23 years"
| data$consent_coded=="Yes am 20" | data$consent_coded=="Yed" | data$consent_coded=="Yeah" | data$consent_coded=="Yes am above 18" | data$consent_coded=="Yes am over 18" | data$consent_coded=="Yes i am 36 yrs" | data$consent_coded=="Yh I'm 21" | data$consent_coded=="sure let go"] <- "I consent, start now"
data$consent_coded[!data$consent_coded %in% c("I consent, start now")] <- "No, let's stop here"summarized_data <- data %>%
group_by(phase_coded) %>%
summarise(
yes = sum(consent_coded == "I consent, start now", na.rm = TRUE),
no = sum(consent_coded == "No, let's stop here", na.rm = TRUE)
) %>%
ungroup()
# Calculate total row
total_row <- summarized_data %>%
summarise(
phase_coded = "Total",
yes = sum(yes, na.rm = TRUE),
no = sum(no, na.rm = TRUE)
)
# Combine the total row with the summarized data
final_data <- bind_rows(summarized_data, total_row)
# Add total column and create the table
final_data %>%
mutate(total = yes + no) %>%
kable("html", col.names = c("Phase", "I consent, start now", "No, let's stop here", "Total")) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(width = "100%")| Phase | I consent, start now | No, let's stop here | Total |
|---|---|---|---|
| Phase 1 | 29993 | 7015 | 37008 |
| Phase 2 | 40681 | 10142 | 50823 |
| Total | 70674 | 17157 | 87831 |
Looking at consent and attempted repeaters
data %>%
group_by(repeater) %>%
summarise(yes = sum(consent_coded == "I consent, start now", na.rm = TRUE),
no = sum(consent_coded == "No, let's stop here", na.rm = TRUE)
) %>%
mutate(total = yes + no) %>%
kable("html", col.names = c("Attempted Repeater", "I consent, start now", "No, let's stop here", "Total")) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(width = "100%")| Attempted Repeater | I consent, start now | No, let's stop here | Total |
|---|---|---|---|
| TRUE | 19572 | 16 | 19588 |
| NA | 51102 | 17141 | 68243 |
Generating/Exporting a clean dataset
- The clean dataset is named “misinfo_clean.csv”
data_clean <- data %>%
write_csv("misinfo_clean.csv")