library(tidyverse)
library(kableExtra)
library(broom)
library(texreg)
library(patchwork)
library(readr)
library(lubridate)
library(boot)
library(stargazer)
library(fixest)
library(pander)
library(stringi)
FB-Misinformation Data Cleaning
The purpose of this document is to clean the raw FB Misinformation Chatfuel data.
Loading R library
<- function(data) {
create_table # Reshape data: pivot from long to wide format
<- data %>%
reshaped_data pivot_wider(names_from = arm, values_from = phase_coded)
# Generate and return the table
kable(reshaped_data)
}
Loading the raw dataset and generating a seperate anon_id for each respondents
The data was pulled on 01/18/2024. The data is downloaded from Chatfuel without personal private information of respondents such as name, profile picture url, and etc. Chatfuel user id is collected, but for safety, we’ll generate a unique id (anon_id) for each of the users.
setwd("C:/Users/kietle/OneDrive - Stanford/Documents/")
<- read_csv("Project_Don_t_Get_Duped_2024_01_18_18_27_44.csv")
data <- data %>%
df_chat_anon mutate(anon_id = 1:nrow(.)) %>%
select(-`chatfuel user id`)
write_csv(df_chat_anon, "Project_Don_t_Get_Duped_2024_01_18_18_27_44_anon.csv")
setwd("C:/Users/kietle/OneDrive - Stanford/Documents/")
<- read_csv("Project_Don_t_Get_Duped_2024_01_18_18_27_44_anon.csv") data
Creating a copy of raw dataset
<- data %>%
data_raw write_csv("misinfo_raw.csv")
Cleaning phases (phase_coded)
Looking at respondents across different phases and generating generating a phase identifier (“phase_coded”)
Categorizing Phases
Respondents who were in Version ExperimentLaunchV1 or ExperimentLaunchV2 as Phase 1
Respondents who were in Version Phase2, Phase2_V2 or Phase2_V3 as Phase 2
<- data %>%
total_count count(NewVersion) %>%
summarise(total = sum(n)) %>%
pull(total)
%>%
data count(NewVersion) %>%
rbind(data.frame(NewVersion = "Total", n = total_count)) %>%
kable("html", col.names=(c("Phase", "Count"))) %>%
::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") kableExtra
Phase | Count |
---|---|
ExperimentLaunchV1 | 9914 |
ExperimentLaunchV2 | 30519 |
Final_Technical_Pilot | 181 |
Misinfo_Pilot | 290 |
Phase2 | 8058 |
Phase2_V2 | 27276 |
Phase2_V3 | 15512 |
NA | 66568 |
Total | 158318 |
# generating coded phase 1 and 2 variables and dropping respondents not in Phase 1 or Phase 2
$phase_coded <-data$NewVersion
data$phase_coded[data$phase_coded=="ExperimentLaunchV1" | data$phase_coded=="ExperimentLaunchV2"] = "Phase 1"
data$phase_coded[data$phase_coded=="Phase2" | data$phase_coded=="Phase2_V2" | data$phase_coded=="Phase2_V3"] = "Phase 2"
data
<- data %>%
total_count count(phase_coded) %>%
summarise(total = sum(n)) %>%
pull(total)
%>%
data count(phase_coded) %>%
rbind(data.frame(phase_coded = "Total", n = total_count)) %>%
kable("html", col.names=(c("Phase", "Count"))) %>%
::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") kableExtra
Phase | Count |
---|---|
Final_Technical_Pilot | 181 |
Misinfo_Pilot | 290 |
Phase 1 | 40433 |
Phase 2 | 50846 |
NA | 66568 |
Total | 158318 |
Dropping respondents not in Phase 1 or Phase 2
<- data %>%
data filter(phase_coded == "Phase 1" | phase_coded == "Phase 2")
<- data %>%
total_count count(phase_coded) %>%
summarise(total = sum(n)) %>%
pull(total)
%>%
data count(phase_coded) %>%
rbind(data.frame(phase_coded = "Total", n = total_count)) %>%
kable("html", col.names=(c("Phase", "Count"))) %>%
::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") kableExtra
Phase | Count |
---|---|
Phase 1 | 40433 |
Phase 2 | 50846 |
Total | 91279 |
Identifying attempted and actual repeaters
Repeaters are classified as “attempted repeaters” or “actual repeaters”.
Attempted repeaters are respondents trying to repeat the chatbot but were turned away automatically. The chatbot variable “repeater” identify if a person attempted repeat and based on their personal and chatfuel identification, we can determine if that person had completed the chatbot before.
Actual repeaters are respondents who actually repeated the chatbot.These actual repeaters are identified by having a timestamp in at least two treatment arms.
<- data %>%
data_repeaters select(anon_id, phase_coded,
repeater,
arm, # video start and end time
InterArms_start_time, InterArmsVideo_end_time, #long baseline start,end time
InterArmsLongBase_start_time, InterArmsLongBase_end_time, #short baseline start,end time
InterArmsBase_start_time, InterArmsBase_end_time, #sms start,end time
InterArmsSMS_start_time, InterArmsSMS_end_time, # game start and end time
InterArmsGame_start_time, interArmsGame_end_time, %>%
MisinfoQuiz_start_time) mutate(has_longbase_time = if_else(!is.na(InterArmsLongBase_start_time) | !is.na(InterArmsLongBase_end_time), TRUE, FALSE),
has_video_time = if_else(!is.na(InterArms_start_time) | !is.na(InterArmsVideo_end_time), TRUE, FALSE),
has_game_time = if_else(!is.na(InterArmsGame_start_time) | !is.na(interArmsGame_end_time), TRUE, FALSE),
has_shortbase_time = if_else(!is.na(InterArmsBase_start_time) | !is.na(InterArmsBase_end_time), TRUE, FALSE),
has_sms_time = if_else(!is.na(InterArmsSMS_start_time) | !is.na(InterArmsSMS_end_time), TRUE, FALSE),
repeated = has_longbase_time + has_shortbase_time + has_sms_time + has_video_time +has_game_time > 1) %>%
select(anon_id, arm, repeater, repeated, phase_coded, MisinfoQuiz_start_time)
<- data_repeaters %>%
data_repeaters_by_date_P1 filter(phase_coded=="Phase 1") %>%
mutate(day = as.Date(MisinfoQuiz_start_time)) %>%
group_by(day) %>%
count(repeated) %>%
mutate(pct = n/sum(n))
<- data_repeaters %>%
data_repeaters_by_date_P2 filter(phase_coded=="Phase 2") %>%
mutate(day = as.Date(MisinfoQuiz_start_time)) %>%
group_by(day) %>%
count(repeated) %>%
mutate(pct = n/sum(n))
<- data_repeaters %>%
summarized_data group_by(phase_coded) %>%
summarize(
attempted_repeaters = sum(repeater, na.rm = TRUE),
actual_repeaters = sum(repeated)
%>%
) ungroup()
# Calculate total row
<- summarized_data %>%
total_row summarise(
phase_coded = "Total",
attempted_repeaters = sum(attempted_repeaters, na.rm = TRUE),
actual_repeaters = sum(actual_repeaters, na.rm = TRUE)
)
# Combine the total row with the summarized data
<- bind_rows(summarized_data, total_row)
final_data
# Create the table
%>%
final_data mutate(total=attempted_repeaters+actual_repeaters) %>%
kable("html", col.names = c("Phase", "Attempted Repeaters", "Actual Repeaters", "Total")) %>%
::kable_styling(bootstrap_options = c("striped", "hover")) kableExtra
Phase | Attempted Repeaters | Actual Repeaters | Total |
---|---|---|---|
Phase 1 | 11612 | 3425 | 15037 |
Phase 2 | 9955 | 23 | 9978 |
Total | 21567 | 3448 | 25015 |
Repeaters entered chatbot in Phase 1
ggplot(data_repeaters_by_date_P1 %>% filter(repeated),
aes(x = day, y = pct)) +
geom_bar(position = "identity", stat = "identity") +
scale_y_continuous(limits = c(0, 1), labels = scales::percent, expand = c(0, 0, 0, 0)) +
scale_x_date(breaks = "day") +
labs(x = "Day",
y = "% respondents who repeatedly entered chatbot in Phase 1") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90))
Repeaters entered chatbot in Phase 2
ggplot(data_repeaters_by_date_P2 %>% filter(repeated),
aes(x = day, y = pct)) +
geom_bar(position = "identity", stat = "identity") +
scale_y_continuous(limits = c(0, 1), labels = scales::percent, expand = c(0, 0, 0, 0)) +
scale_x_date(breaks = "day") +
labs(x = "Day",
y = "% respondents who repeatedly entered chatbot in Phase 2") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90))
Excluding repeaters
<- left_join(data, data_repeaters %>% select(anon_id, repeated), by = "anon_id") %>%
data filter(!repeated)
<- data_repeaters %>%
summarized_data group_by(phase_coded) %>%
summarize(
attempted_repeaters = sum(repeater, na.rm = TRUE),
actual_repeaters = sum(repeated)
%>%
) ungroup()
# Calculate total row
<- summarized_data %>%
total_row summarise(
phase_coded = "Total",
attempted_repeaters = sum(attempted_repeaters, na.rm = TRUE),
actual_repeaters = sum(actual_repeaters, na.rm = TRUE)
)
# Combine the total row with the summarized data
<- bind_rows(summarized_data, total_row)
final_data
# Create the table
%>%
final_data mutate(total=attempted_repeaters+actual_repeaters) %>%
kable("html", col.names = c("Phase", "Attempted Repeaters", "Actual Repeaters", "Total")) %>%
::kable_styling(bootstrap_options = c("striped", "hover")) kableExtra
Phase | Attempted Repeaters | Actual Repeaters | Total |
---|---|---|---|
Phase 1 | 11612 | 3425 | 15037 |
Phase 2 | 9955 | 23 | 9978 |
Total | 21567 | 3448 | 25015 |
Cleaning treatment arms variable (arm_coded)
Checking the raw treatment arm variable (“arm”), we see the following possible entries: baseline, game, longbaseline, sms, video
There is one entry of “video” in Phase 2, so we’ll treat it as a missing value
Baseline 1 : If arm == “baseline” and phase_coded == “Phase 1”
SMS: If arm == “sms”
Video: If arm == “video” and phase_coded == “Phase 1”
Game : If arm == “game” and phase_coded == “Phase 1”
Short Baseline 2 : If arm == “baseline” and phase_coded == “Phase 2”
Long Baseline: If arm == “longbaseline” and phase_coded == “Phase 2”
#checking possible treatment variables
%>%
data count(arm) %>%
kable("html", col.names=(c("Treatment Arm", "Count"))) %>%
::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") kableExtra
Treatment Arm | Count |
---|---|
baseline | 13715 |
game | 7491 |
longbaseline | 14160 |
sms | 27849 |
video | 7546 |
NA | 17070 |
Recoding treatment arm variables
<- data %>%
data mutate(
arm_coded = case_when(
== "baseline" & phase_coded == "Phase 1" ~ "Baseline 1",
arm %in% c("SMS", "sms") ~ "SMS",
arm == "video" & phase_coded == "Phase 1" ~ "Video",
arm == "game" & phase_coded == "Phase 1" ~ "Game",
arm == "baseline" & phase_coded == "Phase 2" ~ "Short Baseline 2",
arm == "longbaseline" & phase_coded == "Phase 2" ~ "Long Baseline",
arm TRUE ~ NA_character_
)
)
%>%
data group_by(phase_coded) %>%
summarise(baseline_1=sum(arm_coded=="Baseline 1", na.rm=TRUE),
sms= sum(arm_coded=="SMS", na.rm=TRUE),
video= sum(arm_coded=="Video", na.rm=TRUE),
game= sum(arm_coded=="Game", na.rm=TRUE),
short_baseline_2=sum(arm_coded=="Short Baseline 2", na.rm=TRUE),
long_baseline = sum(arm_coded=="Long Baseline", na.rm=TRUE)) %>%
mutate(total=baseline_1+sms+video+game+short_baseline_2+long_baseline) %>%
kable("html", col.names=(c("Phase", "Baseline 1", "SMS",
"Video", "Game", "Short Baseline 2",
"Long Baseline", "Total"))) %>%
::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") kableExtra
Phase | Baseline 1 | SMS | Video | Game | Short Baseline 2 | Long Baseline | Total |
---|---|---|---|---|---|---|---|
Phase 1 | 7463 | 7564 | 7545 | 7491 | 0 | 0 | 30063 |
Phase 2 | 0 | 20285 | 0 | 0 | 6252 | 14158 | 40695 |
Cleaning user country (country_coded)
Variable user_country indicates where the respondents come from
Responses are messy with some random links and emojis which are treated as missing values
Answers who include an explicit name of the country or small typos are renamed as one of the four countries
Checking possible country answers
$country_coded <- data$user_country
data%>%
data count(country_coded)
# A tibble: 221 × 2
country_coded n
<chr> <int>
1 ",nigeria" 1
2 ".\nSouth Africa" 1
3 ". south Africa" 1
4 ".." 1
5 "Accra" 1
6 "Alright" 1
7 "Am also called BLESS" 1
8 "Am from Dubai" 1
9 "Am from Ukraine" 1
10 "Am from benue state" 1
# ℹ 211 more rows
Cleaning and renaming country responses
$country_coded[data$country_coded == "kenya" |data$country_coded == "Iam from Kenya" | data$country_coded == "KENYA" | data$country_coded == "Kenya." | data$country_coded == "kENYA" | data$country_coded == "Iam from Kenya"] = "Kenya"
data
$country_coded[data$country_coded == "South africa" | data$country_coded == "south africa" | data$country_coded == "SOUTH AFRICA" | data$country_coded == "south Africa" | data$country_coded == ".\nSouth Africa" | data$country_coded == ". south Africa" | data$country_coded == "SOUTH: AFRICA:" | data$country_coded == "South africa" | data$country_coded == "South Africa." | data$country_coded == "South Africa..." | data$country_coded == "SOUTH Africa" ] = "South Africa"
data
$country_coded[data$country_coded == "nigeria" | data$country_coded == "NIGERIA" | data$country_coded== ",nigeria" | data$country_coded== "Am in Nigeria" | data$country_coded== "I'm from Nigeria" |data$country_coded == "Hi Hi\nNigeria"| data$country_coded == "Nigeria." | data$country_coded == "Nigeria.." | data$country_coded == "Nigeria 🇳🇬" | data$country_coded == "Plateau State, Nigeria"| data$country_coded == "Plateau State, Nigeria" | data$country_coded == "nigerian"] = "Nigeria"
data
$country_coded[data$country_coded == "GHANA" | data$country_coded == "ghana" | data$country_coded == "Am in Ghana" | data$country_coded == "GHANA." | data$country_coded == "GHAna" | data$country_coded == "Gahna" | data$country_coded == "Ghana." | data$country_coded == "In Ghana" | data$country_coded == "In Ghana" | data$country_coded == "From Ghana please" ] = "Ghana"
data
$country_coded[!data$country_coded %in% c("Kenya", "South Africa", "Nigeria", "Ghana")] <- NA data
<- data %>%
summarized_data group_by(phase_coded) %>%
summarise(
ghana = sum(country_coded == "Ghana", na.rm = TRUE),
kenya = sum(country_coded == "Kenya", na.rm = TRUE),
sa = sum(country_coded == "South Africa", na.rm = TRUE),
nigeria = sum(country_coded == "Nigeria", na.rm = TRUE)
%>%
) ungroup()
# Calculate total row
<- summarized_data %>%
total_row summarise(
phase_coded = "Total",
ghana = sum(ghana, na.rm = TRUE),
kenya = sum(kenya, na.rm = TRUE),
sa = sum(sa, na.rm = TRUE),
nigeria = sum(nigeria, na.rm = TRUE)
)
# Combine the total row with the summarized data
<- bind_rows(summarized_data, total_row)
final_data
# Add total column and create the table
%>%
final_data mutate(total = ghana + kenya + sa + nigeria) %>%
kable("html", col.names = c("Phase", "Ghana", "Kenya", "South Africa", "Nigeria", "Total")) %>%
::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtrascroll_box(width = "100%")
Phase | Ghana | Kenya | South Africa | Nigeria | Total |
---|---|---|---|---|---|
Phase 1 | 4415 | 4286 | 9707 | 14191 | 32599 |
Phase 2 | 4674 | 6122 | 14106 | 20470 | 45372 |
Total | 9089 | 10408 | 23813 | 34661 | 77971 |
Cleaning consent responses (consent_coded)
The chatbot quick responses are “I consent, start now” for consenting and “No, let’s stop here” for not providing consent
Beside the preset selection, We received random responses such emojis, questions, links and other incomplete sentences
If responses beside “I consent, start now” include an explicit agreement, then those responses are considered “I consent start now”. There are some responses only contain the age information without an explicit agreement and these responses are not considered “I consent, start now”.
Those who did not answer consent or did not answer are considered “No, let’s stop here”
Checking possible consent responses
$consent_coded <- data$consent
data%>% count(consent_coded) data
# A tibble: 147 × 2
consent_coded n
<chr> <int>
1 ". When?" 1
2 "0533521502" 1
3 "0745432220" 1
4 "09130184439" 1
5 "26 old" 1
6 "37 year old" 1
7 "6es" 1
8 "Abi \U0001f644\U0001f644" 1
9 "Alright" 2
10 "Am 22" 1
# ℹ 137 more rows
Cleaning and renaming consent responses
$consent_coded[data$consent_coded=="I Consent, start now" | data$consent_coded=="I consent start now" | data$consent_coded=="I consent start now " | data$consent_coded=="I consent,\nStart now" | data$consent_coded=="Sure!" | data$consent_coded=="Sure, let's go!" | data$consent_coded=="Yes" | data$consent_coded=="Yes I im" | data$consent_coded=="Yes I'm 23 years"
data| data$consent_coded=="Yes am 20" | data$consent_coded=="Yed" | data$consent_coded=="Yeah" | data$consent_coded=="Yes am above 18" | data$consent_coded=="Yes am over 18" | data$consent_coded=="Yes i am 36 yrs" | data$consent_coded=="Yh I'm 21" | data$consent_coded=="sure let go"] <- "I consent, start now"
$consent_coded[!data$consent_coded %in% c("I consent, start now")] <- "No, let's stop here" data
<- data %>%
summarized_data group_by(phase_coded) %>%
summarise(
yes = sum(consent_coded == "I consent, start now", na.rm = TRUE),
no = sum(consent_coded == "No, let's stop here", na.rm = TRUE)
%>%
) ungroup()
# Calculate total row
<- summarized_data %>%
total_row summarise(
phase_coded = "Total",
yes = sum(yes, na.rm = TRUE),
no = sum(no, na.rm = TRUE)
)
# Combine the total row with the summarized data
<- bind_rows(summarized_data, total_row)
final_data
# Add total column and create the table
%>%
final_data mutate(total = yes + no) %>%
kable("html", col.names = c("Phase", "I consent, start now", "No, let's stop here", "Total")) %>%
::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtrascroll_box(width = "100%")
Phase | I consent, start now | No, let's stop here | Total |
---|---|---|---|
Phase 1 | 29993 | 7015 | 37008 |
Phase 2 | 40681 | 10142 | 50823 |
Total | 70674 | 17157 | 87831 |
Looking at consent and attempted repeaters
%>%
data group_by(repeater) %>%
summarise(yes = sum(consent_coded == "I consent, start now", na.rm = TRUE),
no = sum(consent_coded == "No, let's stop here", na.rm = TRUE)
%>%
) mutate(total = yes + no) %>%
kable("html", col.names = c("Attempted Repeater", "I consent, start now", "No, let's stop here", "Total")) %>%
::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtrascroll_box(width = "100%")
Attempted Repeater | I consent, start now | No, let's stop here | Total |
---|---|---|---|
TRUE | 19572 | 16 | 19588 |
NA | 51102 | 17141 | 68243 |
Generating/Exporting a clean dataset
- The clean dataset is named “misinfo_clean.csv”
<- data %>%
data_clean write_csv("misinfo_clean.csv")