FB-Misinformation Data Cleaning

Author

Kiet Le

Published

January 24, 2024

The purpose of this document is to clean the raw FB Misinformation Chatfuel data.

Loading R library

library(tidyverse)
library(kableExtra)
library(broom)
library(texreg)
library(patchwork)
library(readr)
library(lubridate)
library(boot)
library(stargazer)
library(fixest)
library(pander)
library(stringi)
create_table <- function(data) {
  # Reshape data: pivot from long to wide format
  reshaped_data <- data %>% 
    pivot_wider(names_from = arm, values_from = phase_coded)

  # Generate and return the table
  kable(reshaped_data)
}

Loading the raw dataset and generating a seperate anon_id for each respondents

The data was pulled on 01/18/2024. The data is downloaded from Chatfuel without personal private information of respondents such as name, profile picture url, and etc. Chatfuel user id is collected, but for safety, we’ll generate a unique id (anon_id) for each of the users.

setwd("C:/Users/kietle/OneDrive - Stanford/Documents/")
data <- read_csv("Project_Don_t_Get_Duped_2024_01_18_18_27_44.csv")
df_chat_anon <- data %>%
  mutate(anon_id = 1:nrow(.)) %>% 
  select(-`chatfuel user id`)
write_csv(df_chat_anon, "Project_Don_t_Get_Duped_2024_01_18_18_27_44_anon.csv")
setwd("C:/Users/kietle/OneDrive - Stanford/Documents/")
data <- read_csv("Project_Don_t_Get_Duped_2024_01_18_18_27_44_anon.csv")

Creating a copy of raw dataset

data_raw <- data %>%
  write_csv("misinfo_raw.csv")

Cleaning phases (phase_coded)

Looking at respondents across different phases and generating generating a phase identifier (“phase_coded”)

Categorizing Phases

  • Respondents who were in Version ExperimentLaunchV1 or ExperimentLaunchV2 as Phase 1

  • Respondents who were in Version Phase2, Phase2_V2 or Phase2_V3 as Phase 2

total_count <- data %>% 
  count(NewVersion) %>% 
  summarise(total = sum(n)) %>%
  pull(total)

data %>% 
  count(NewVersion) %>%
  rbind(data.frame(NewVersion = "Total", n = total_count)) %>%
  kable("html", col.names=(c("Phase", "Count"))) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") 
Phase Count
ExperimentLaunchV1 9914
ExperimentLaunchV2 30519
Final_Technical_Pilot 181
Misinfo_Pilot 290
Phase2 8058
Phase2_V2 27276
Phase2_V3 15512
NA 66568
Total 158318
# generating coded phase 1 and 2 variables and dropping respondents not in Phase 1 or Phase 2
data$phase_coded <-data$NewVersion 
data$phase_coded[data$phase_coded=="ExperimentLaunchV1" | data$phase_coded=="ExperimentLaunchV2"] = "Phase 1"
data$phase_coded[data$phase_coded=="Phase2" | data$phase_coded=="Phase2_V2" | data$phase_coded=="Phase2_V3"] = "Phase 2"

total_count <- data %>% 
  count(phase_coded) %>% 
  summarise(total = sum(n)) %>%
  pull(total)

data %>% 
  count(phase_coded) %>%
  rbind(data.frame(phase_coded = "Total", n = total_count)) %>%
  kable("html", col.names=(c("Phase", "Count"))) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") 
Phase Count
Final_Technical_Pilot 181
Misinfo_Pilot 290
Phase 1 40433
Phase 2 50846
NA 66568
Total 158318

Dropping respondents not in Phase 1 or Phase 2

data <- data %>% 
  filter(phase_coded == "Phase 1" | phase_coded == "Phase 2")
total_count <- data %>% 
  count(phase_coded) %>% 
  summarise(total = sum(n)) %>%
  pull(total)

data %>% 
  count(phase_coded) %>%
  rbind(data.frame(phase_coded = "Total", n = total_count)) %>%
  kable("html", col.names=(c("Phase", "Count"))) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") 
Phase Count
Phase 1 40433
Phase 2 50846
Total 91279

Identifying attempted and actual repeaters

  • Repeaters are classified as “attempted repeaters” or “actual repeaters”.

  • Attempted repeaters are respondents trying to repeat the chatbot but were turned away automatically. The chatbot variable “repeater” identify if a person attempted repeat and based on their personal and chatfuel identification, we can determine if that person had completed the chatbot before.

  • Actual repeaters are respondents who actually repeated the chatbot.These actual repeaters are identified by having a timestamp in at least two treatment arms.

data_repeaters <- data %>%
    select(anon_id, phase_coded,
           repeater,
           arm, 
           InterArms_start_time, InterArmsVideo_end_time, # video start and end time
           InterArmsLongBase_start_time, InterArmsLongBase_end_time, #long baseline start,end time
           InterArmsBase_start_time, InterArmsBase_end_time, #short baseline start,end time
           InterArmsSMS_start_time, InterArmsSMS_end_time, #sms start,end time
           InterArmsGame_start_time, interArmsGame_end_time, # game start and end time
           MisinfoQuiz_start_time) %>%
    mutate(has_longbase_time = if_else(!is.na(InterArmsLongBase_start_time) | !is.na(InterArmsLongBase_end_time), TRUE, FALSE),
           has_video_time = if_else(!is.na(InterArms_start_time) | !is.na(InterArmsVideo_end_time), TRUE, FALSE),
           has_game_time = if_else(!is.na(InterArmsGame_start_time) | !is.na(interArmsGame_end_time), TRUE, FALSE),
           has_shortbase_time = if_else(!is.na(InterArmsBase_start_time) | !is.na(InterArmsBase_end_time), TRUE, FALSE),
           has_sms_time = if_else(!is.na(InterArmsSMS_start_time) | !is.na(InterArmsSMS_end_time), TRUE, FALSE),
           repeated = has_longbase_time + has_shortbase_time + has_sms_time + has_video_time +has_game_time > 1) %>%
    select(anon_id, arm, repeater, repeated, phase_coded, MisinfoQuiz_start_time)

data_repeaters_by_date_P1 <- data_repeaters %>% 
  filter(phase_coded=="Phase 1") %>%
  mutate(day = as.Date(MisinfoQuiz_start_time)) %>% 
  group_by(day) %>% 
  count(repeated) %>%
  mutate(pct = n/sum(n))

data_repeaters_by_date_P2 <- data_repeaters %>% 
  filter(phase_coded=="Phase 2") %>%
  mutate(day = as.Date(MisinfoQuiz_start_time)) %>% 
  group_by(day) %>% 
  count(repeated) %>%
  mutate(pct = n/sum(n))
summarized_data <- data_repeaters %>%
  group_by(phase_coded) %>%
  summarize(
    attempted_repeaters = sum(repeater, na.rm = TRUE),
    actual_repeaters = sum(repeated)
  ) %>%
  ungroup()

# Calculate total row
total_row <- summarized_data %>%
  summarise(
    phase_coded = "Total",
    attempted_repeaters = sum(attempted_repeaters, na.rm = TRUE),
    actual_repeaters = sum(actual_repeaters, na.rm = TRUE)
  )

# Combine the total row with the summarized data
final_data <- bind_rows(summarized_data, total_row)

# Create the table
final_data %>%
  mutate(total=attempted_repeaters+actual_repeaters) %>%
  kable("html", col.names = c("Phase", "Attempted Repeaters", "Actual Repeaters", "Total")) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
Phase Attempted Repeaters Actual Repeaters Total
Phase 1 11612 3425 15037
Phase 2 9955 23 9978
Total 21567 3448 25015

Repeaters entered chatbot in Phase 1

ggplot(data_repeaters_by_date_P1 %>% filter(repeated),
       aes(x = day, y = pct)) +
  geom_bar(position = "identity", stat = "identity") +
  scale_y_continuous(limits = c(0, 1), labels = scales::percent, expand = c(0, 0, 0, 0)) +
  scale_x_date(breaks = "day") +
  labs(x = "Day",
       y = "% respondents who repeatedly entered chatbot in Phase 1") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90))

Repeaters entered chatbot in Phase 2

ggplot(data_repeaters_by_date_P2 %>% filter(repeated),
       aes(x = day, y = pct)) +
  geom_bar(position = "identity", stat = "identity") +
  scale_y_continuous(limits = c(0, 1), labels = scales::percent, expand = c(0, 0, 0, 0)) +
  scale_x_date(breaks = "day") +
  labs(x = "Day",
       y = "% respondents who repeatedly entered chatbot in Phase 2") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90))

Excluding repeaters

data <- left_join(data, data_repeaters %>% select(anon_id, repeated), by = "anon_id") %>%
    filter(!repeated)
summarized_data <- data_repeaters %>%
  group_by(phase_coded) %>%
  summarize(
    attempted_repeaters = sum(repeater, na.rm = TRUE),
    actual_repeaters = sum(repeated)
  ) %>%
  ungroup()

# Calculate total row
total_row <- summarized_data %>%
  summarise(
    phase_coded = "Total",
    attempted_repeaters = sum(attempted_repeaters, na.rm = TRUE),
    actual_repeaters = sum(actual_repeaters, na.rm = TRUE)
  )

# Combine the total row with the summarized data
final_data <- bind_rows(summarized_data, total_row)

# Create the table
final_data %>%
  mutate(total=attempted_repeaters+actual_repeaters) %>%
  kable("html", col.names = c("Phase", "Attempted Repeaters", "Actual Repeaters", "Total")) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
Phase Attempted Repeaters Actual Repeaters Total
Phase 1 11612 3425 15037
Phase 2 9955 23 9978
Total 21567 3448 25015

Cleaning treatment arms variable (arm_coded)

  • Checking the raw treatment arm variable (“arm”), we see the following possible entries: baseline, game, longbaseline, sms, video

  • There is one entry of “video” in Phase 2, so we’ll treat it as a missing value

  • Baseline 1 : If arm == “baseline” and phase_coded == “Phase 1”

  • SMS: If arm == “sms”

  • Video: If arm == “video” and phase_coded == “Phase 1”

  • Game : If arm == “game” and phase_coded == “Phase 1”

  • Short Baseline 2 : If arm == “baseline” and phase_coded == “Phase 2”

  • Long Baseline: If arm == “longbaseline” and phase_coded == “Phase 2”

#checking possible treatment variables 
data %>% 
  count(arm) %>%
  kable("html", col.names=(c("Treatment Arm", "Count"))) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%") 
Treatment Arm Count
baseline 13715
game 7491
longbaseline 14160
sms 27849
video 7546
NA 17070

Recoding treatment arm variables

data <- data %>%
  mutate(
    arm_coded = case_when(
      arm == "baseline" & phase_coded == "Phase 1" ~ "Baseline 1",
      arm %in% c("SMS", "sms") ~ "SMS",
      arm == "video" & phase_coded == "Phase 1" ~ "Video",
      arm == "game" & phase_coded == "Phase 1" ~ "Game",
      arm == "baseline" & phase_coded == "Phase 2" ~ "Short Baseline 2",
      arm == "longbaseline" & phase_coded == "Phase 2" ~ "Long Baseline",
      TRUE ~ NA_character_
    )
  )

data %>%
  group_by(phase_coded) %>%
  summarise(baseline_1=sum(arm_coded=="Baseline 1", na.rm=TRUE), 
            sms= sum(arm_coded=="SMS", na.rm=TRUE), 
            video= sum(arm_coded=="Video", na.rm=TRUE),
            game= sum(arm_coded=="Game", na.rm=TRUE),
            short_baseline_2=sum(arm_coded=="Short Baseline 2", na.rm=TRUE),
            long_baseline = sum(arm_coded=="Long Baseline", na.rm=TRUE)) %>%
   mutate(total=baseline_1+sms+video+game+short_baseline_2+long_baseline) %>%
   kable("html", col.names=(c("Phase", "Baseline 1", "SMS",
                             "Video", "Game", "Short Baseline 2",
                             "Long Baseline", "Total"))) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(width = "100%")
Phase Baseline 1 SMS Video Game Short Baseline 2 Long Baseline Total
Phase 1 7463 7564 7545 7491 0 0 30063
Phase 2 0 20285 0 0 6252 14158 40695

Cleaning user country (country_coded)

  • Variable user_country indicates where the respondents come from

  • Responses are messy with some random links and emojis which are treated as missing values

  • Answers who include an explicit name of the country or small typos are renamed as one of the four countries

Checking possible country answers

data$country_coded <- data$user_country
data %>%
  count(country_coded)
# A tibble: 221 × 2
   country_coded              n
   <chr>                  <int>
 1 ",nigeria"                 1
 2 ".\nSouth Africa"          1
 3 ". south Africa"           1
 4 ".."                       1
 5 "Accra"                    1
 6 "Alright"                  1
 7 "Am also called BLESS"     1
 8 "Am from Dubai"            1
 9 "Am from Ukraine"          1
10 "Am from benue state"      1
# ℹ 211 more rows

Cleaning and renaming country responses

data$country_coded[data$country_coded == "kenya" |data$country_coded == "Iam from Kenya" | data$country_coded == "KENYA" | data$country_coded == "Kenya." | data$country_coded == "kENYA" | data$country_coded == "Iam from Kenya"] = "Kenya"

data$country_coded[data$country_coded == "South africa" | data$country_coded == "south africa" | data$country_coded == "SOUTH AFRICA" | data$country_coded == "south Africa" | data$country_coded == ".\nSouth Africa" | data$country_coded == ". south Africa" | data$country_coded == "SOUTH: AFRICA:" | data$country_coded == "South africa" | data$country_coded == "South Africa." | data$country_coded == "South Africa..." | data$country_coded == "SOUTH Africa"  ] = "South Africa"

data$country_coded[data$country_coded == "nigeria" | data$country_coded == "NIGERIA" | data$country_coded== ",nigeria" |  data$country_coded== "Am in Nigeria" |  data$country_coded== "I'm from Nigeria" |data$country_coded == "Hi Hi\nNigeria"| data$country_coded == "Nigeria." | data$country_coded == "Nigeria.." | data$country_coded == "Nigeria 🇳🇬" | data$country_coded == "Plateau State, Nigeria"| data$country_coded == "Plateau State, Nigeria" | data$country_coded == "nigerian"] = "Nigeria"


data$country_coded[data$country_coded == "GHANA" | data$country_coded == "ghana" | data$country_coded == "Am in Ghana" | data$country_coded == "GHANA." | data$country_coded == "GHAna" | data$country_coded == "Gahna" | data$country_coded == "Ghana." | data$country_coded == "In Ghana" | data$country_coded == "In Ghana" | data$country_coded == "From Ghana please" ] = "Ghana"

data$country_coded[!data$country_coded %in% c("Kenya", "South Africa", "Nigeria", "Ghana")] <- NA
summarized_data <- data %>%
  group_by(phase_coded) %>%
  summarise(
    ghana = sum(country_coded == "Ghana", na.rm = TRUE), 
    kenya = sum(country_coded == "Kenya", na.rm = TRUE), 
    sa = sum(country_coded == "South Africa", na.rm = TRUE), 
    nigeria = sum(country_coded == "Nigeria", na.rm = TRUE)
  ) %>%
  ungroup()

# Calculate total row
total_row <- summarized_data %>%
  summarise(
    phase_coded = "Total",
    ghana = sum(ghana, na.rm = TRUE), 
    kenya = sum(kenya, na.rm = TRUE), 
    sa = sum(sa, na.rm = TRUE), 
    nigeria = sum(nigeria, na.rm = TRUE)
  )

# Combine the total row with the summarized data
final_data <- bind_rows(summarized_data, total_row)

# Add total column and create the table
final_data %>%
  mutate(total = ghana + kenya + sa + nigeria) %>%
  kable("html", col.names = c("Phase", "Ghana", "Kenya", "South Africa", "Nigeria", "Total")) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% 
  scroll_box(width = "100%")
Phase Ghana Kenya South Africa Nigeria Total
Phase 1 4415 4286 9707 14191 32599
Phase 2 4674 6122 14106 20470 45372
Total 9089 10408 23813 34661 77971

Generating/Exporting a clean dataset

  • The clean dataset is named “misinfo_clean.csv”
data_clean <- data %>%
  write_csv("misinfo_clean.csv")