# ── Rename key columns ────────────────────────────────────────────────────────
# Map descriptive names (left) onto the positional c<N> columns (right).
# Columns that are not listed keep their existing names.
df <- rename(
  df_raw,
  all_of(c(
    timestamp        = "c1",
    area_live        = "c3",
    area_work        = "c4",
    gender           = "c5",
    education        = "c6",
    employment       = "c7",
    marital_status   = "c9",
    household_size   = "c10",
    visit_frequency  = "c11",
    fav_soups        = "c12",
    dining_channel   = "c15",
    spend_raw        = "c16",
    imp_taste        = "c17",
    imp_freshness    = "c18",
    imp_consistency  = "c19",
    imp_portions     = "c20",
    imp_hygiene      = "c21",
    imp_ambience     = "c22",
    imp_location     = "c23",
    imp_parking      = "c24",
    imp_speed        = "c25",
    imp_staff        = "c26",
    imp_delivery     = "c27",
    imp_online_ord   = "c28",
    imp_pricing      = "c29",
    imp_variety      = "c30",
    imp_takeaway     = "c31",
    imp_authentic    = "c32",
    dissatisfaction  = "c33",
    premium_willing  = "c34",
    pref_setting     = "c35",
    aware_dalos      = "c46",
    overall_exp      = "c47",
    food_quality_exp = "c48",
    relaunch_intent  = "c50"
  ))
)
# ── Likert encoder: keyword match on cleaned, lowercased text ────────────────
# Converts importance labels to integer scores. Rules are tried in order and
# the first match wins:
#   contains "not important"    -> 1
#   contains "slightly"         -> 2
#   contains "moderately"       -> 3
#   starts with "important"     -> 4
#   contains "extremely"        -> 5
#   anything else (incl. NA)    -> NA
#
# x:       vector of responses; coerced with as.character(), so factors work.
# returns: integer vector of the same length as x.
#
# Written in base R on purpose so the helper has no package dependencies.
encode_likert <- function(x) {
  # sub = " " makes iconv() replace bytes it cannot convert with a space;
  # without it, a single bad byte turns the whole response into NA.
  ascii <- iconv(as.character(x), to = "ASCII//TRANSLIT", sub = " ")

  # Collapse whitespace runs (including the substituted spaces), trim the
  # ends and lowercase, so spacing and case do not affect matching.
  clean <- tolower(trimws(gsub("[[:space:]]+", " ", ascii)))

  # The position of each keyword in this vector is the score it maps to.
  keywords <- c(
    "not important", "slightly", "moderately", "^important", "extremely"
  )

  scores <- rep(NA_integer_, length(clean))
  for (score in seq_along(keywords)) {
    # Only fill rows no earlier rule has claimed; grepl() is FALSE for NA.
    hit <- is.na(scores) & grepl(keywords[score], clean)
    scores[hit] <- score
  }
  scores
}
# Names of the 16 importance-rating columns, built from their shared prefix.
likert_cols <- paste0(
  "imp_",
  c(
    "taste", "freshness", "consistency", "portions",
    "hygiene", "ambience", "location", "parking",
    "speed", "staff", "delivery", "online_ord",
    "pricing", "variety", "takeaway", "authentic"
  )
)

# Overwrite each rating column in place with what encode_likert() returns.
df <- mutate(df, across(all_of(likert_cols), \(col) encode_likert(col)))
# ── Spend per meal → ordinal 1–5 ─────────────────────────────────────────────
# Uses digit patterns that survive any currency-symbol encoding
df <- df |>
  mutate(
    # Convert the raw answer to text once, then apply the rules in order.
    # The first matching rule wins, so the "below" wording is tested before
    # the digit patterns; text matching no rule becomes NA.
    spend_num = (\(txt) dplyr::case_when(
      str_detect(txt, "(?i)below|elow") ~ 1L,
      str_detect(txt, "1.?500|1500") ~ 2L,
      str_detect(txt, "3.?001|3001") ~ 3L,
      str_detect(txt, "5.?001|5001") ~ 4L,
      str_detect(txt, "(?i)above|bove|8.?000") ~ 5L,
      TRUE ~ NA_integer_
    ))(as.character(spend_raw)),
    # Display label for each code, as a factor whose levels follow bracket
    # order; an NA code yields an NA label.
    spend_label = factor(
      spend_num,
      levels = 1:5,
      labels = c(
        "Below N1,500", "N1,500-3,000", "N3,001-5,000",
        "N5,001-8,000", "Above N8,000"
      )
    )
  )
# ── Visit frequency → ordinal 1–5 ────────────────────────────────────────────
df <- df |>
  mutate(
    # Convert the raw answer to text once, then apply the rules in order.
    # The first matching rule wins: "less" is tested before "month", and the
    # monthly rule before the weekly digit patterns. No match gives NA.
    visit_num = (\(txt) dplyr::case_when(
      str_detect(txt, "(?i)less") ~ 1L,
      str_detect(txt, "(?i)month") ~ 2L,
      str_detect(txt, "(?i)1.2.*week|1.*2.*week") ~ 3L,
      str_detect(txt, "(?i)3.4|3.*4") ~ 4L,
      str_detect(txt, "(?i)daily") ~ 5L,
      TRUE ~ NA_integer_
    ))(as.character(visit_frequency)),
    # Display label for each code, as a factor whose levels run from least
    # to most frequent; an NA code yields an NA label.
    freq_label = factor(
      visit_num,
      levels = 1:5,
      labels = c(
        "< Once/month", "1-2x/month", "1-2x/week", "3-4x/week", "Daily"
      )
    )
  )
# ── Education groups ──────────────────────────────────────────────────────────
# Collapse the free-text education answer into a small set of groups.
# Rules are tried in order and the first match wins, so the
# secondary/OND keywords take precedence over the later ones.
df <- df |>
  mutate(
    edu_group = dplyr::case_when(
      # Keep genuinely missing answers missing instead of calling them "Other".
      is.na(education) ~ NA_character_,
      str_detect(as.character(education), "(?i)secondary|waec|neco|ond") ~
        "Secondary/OND",
      str_detect(as.character(education), "(?i)hnd") ~ "HND",
      str_detect(as.character(education), "(?i)bachelor|b\\.sc|b\\.a") ~
        "Bachelor's",
      str_detect(as.character(education), "(?i)postgrad|mba|m\\.sc|ph") ~
        "Postgraduate",
      str_detect(as.character(education), "(?i)professional|cert") ~
        "Professional Cert",
      TRUE ~ "Other"
    ),
    # "Other" must be listed as a level: factor() turns any value that is
    # not among `levels` into NA, which would discard the fallback group.
    edu_group = factor(
      edu_group,
      levels = c(
        "Secondary/OND", "HND", "Bachelor's",
        "Postgraduate", "Professional Cert", "Other"
      )
    )
  )
# ── Employment groups ─────────────────────────────────────────────────────────
df <- df |>
  mutate(
    # Convert the answer to text once, then apply the rules in order; the
    # first match wins. Text matching no rule, and missing answers, end up
    # in "Other". The result is a plain character column.
    emp_group = (\(txt) dplyr::case_when(
      str_detect(txt, "(?i)private") ~ "Private sector",
      str_detect(txt, "(?i)self|business") ~ "Self-employed",
      str_detect(txt, "(?i)student") ~ "Student",
      str_detect(txt, "(?i)unemploy") ~ "Unemployed",
      str_detect(txt, "(?i)public|gov|church") ~ "Public/Other",
      TRUE ~ "Other"
    ))(as.character(employment))
  )
# ── Binary outcome variables ──────────────────────────────────────────────────
# as.integer() turns the match flag into 1L (match) / 0L (no match); a
# missing answer gives an NA flag and therefore stays NA.
df <- df |>
  mutate(
    # 1 = "very likely" or "extremely likely"; 0 = any other answer
    intent_binary = as.integer(
      str_detect(
        as.character(relaunch_intent),
        "(?i)very likely|extremely likely"
      )
    ),
    # 1 = "definitely yes" or "probably yes"; 0 = any other answer
    premium_binary = as.integer(
      str_detect(
        as.character(premium_willing),
        "(?i)definitely yes|probably yes"
      )
    )
  )
# ── Median-impute the small number of NAs created by encoding ────────────────
# Fill NAs in the two ordinal codes with that column's own median.
# as.integer() truncates a fractional median (e.g. 2.5 becomes 2) so the
# column stays integer. Only spend_num and visit_num are modified here;
# spend_label and freq_label are left as they were.
df <- df |>
  mutate(
    across(
      c(spend_num, visit_num),
      \(v) coalesce(v, as.integer(median(v, na.rm = TRUE)))
    )
  )
# ── Quick sanity check ────────────────────────────────────────────────────────
# Print the row count of the cleaned data frame.
cat(sprintf("Rows: %d \n", nrow(df)))