Code
library(tidyverse)
library(readxl)
library(janitor)
library(corrplot)
library(ggcorrplot)
library(car)
library(pROC)
library(ResourceSelection)
library(knitr)
library(kableExtra)
library(scales)
library(readr)
library(patchwork)
library(tidyverse)
library(readxl)
library(janitor)
library(corrplot)
library(ggcorrplot)
library(car)
library(pROC)
library(ResourceSelection)
library(knitr)
library(kableExtra)
library(scales)
library(readr)
library(patchwork)
# Load raw data
df_raw <- read_excel("sih_data_to_be_used.xlsx") %>%
  clean_names()

# ---- Helpers for the cleaning pipeline ----

# Convert a free-text budget answer (e.g. "500,000", "500k", "1.5m") to a
# number. parse_number() on its own reads "500k" as 500 and "2m" as 2, so a
# k/thousand or m/million suffix directly after a digit is applied as a
# multiplier. Answers that give no usable figure become NA.
parse_budget <- function(x) {
  raw <- as.character(x)
  txt <- str_to_lower(str_squish(raw))
  value <- parse_number(txt, locale = locale(grouping_mark = ","))
  multiplier <- case_when(
    str_detect(txt, "\\d\\s*(m|mn|million)\\b") ~ 1e6,
    str_detect(txt, "\\d\\s*(k|thousand)\\b") ~ 1e3,
    TRUE ~ 1
  )
  case_when(
    grepl("1\\.5m|1,500,000", raw, ignore.case = TRUE) ~ 1500000,
    grepl("don|dnt|depends|No answer|Less than",
          raw, ignore.case = TRUE) ~ NA_real_,
    TRUE ~ value * multiplier
  )
}

# Encode the income bracket as an ordinal 1-4.
# The label is normalised first (lower case, any dash or "to" becomes "-",
# currency markers and whitespace removed) so the range brackets match
# however the form wrote them.
# NOTE(review): the exact survey labels are not visible here; confirm with
# count(df, income, income_encoded) after running.
encode_income <- function(x) {
  key <- as.character(x) %>%
    str_to_lower() %>%
    str_replace_all("[\u2012\u2013\u2014\u2212]|\\bto\\b", "-") %>%
    str_remove_all("\u20a6|ngn|naira|(?<![a-z])n(?=\\d)") %>%
    str_remove_all("\\s")
  case_when(
    str_detect(key, "below|lessthan|under") ~ 1,
    str_detect(key, "2\\d\\d[,.]?(\\d{3}|k)-5") ~ 2,
    str_detect(key, "5\\d\\d[,.]?(\\d{3}|k)-1") ~ 3,
    # Checked last so a 500k-1M range is not coded as the top bracket
    str_detect(key, "above|morethan|over|1,000,000") ~ 4,
    TRUE ~ NA_real_
  )
}

# 1 when `pattern` occurs in the answer (case-insensitive), 0 when it does
# not, and NA when the answer itself is missing.
flag_match <- function(x, pattern) {
  if_else(is.na(x), NA_real_, as.numeric(grepl(pattern, x, ignore.case = TRUE)))
}

# ---- Cleaning pipeline ----

df <- df_raw %>%
  # Rename columns to short readable names
  rename(
    num_children = how_many_children_do_you_have,
    age_groups = what_are_the_age_groups_of_your_children_tick_all_that_apply,
    area = which_area_of_lagos_are_you_based_in,
    income = what_is_your_monthly_household_income_range,
    events_per_year = how_many_childrens_events_do_you_host_per_year,
    event_types = what_types_of_childrens_events_do_you_host_tick_all_that_apply,
    budget_raw = what_is_your_typical_total_budget_per_childrens_event,
    treat_types = which_treat_types_do_your_children_enjoy_most_at_events_select_up_to_3,
    treat_pct = what_percentage_of_your_event_budget_goes_to_treats_or_food_for_children,
    vendor_used = have_you_used_a_professional_treat_vendor_at_a_childrens_event_before,
    satisfaction = how_satisfied_were_you_with_that_vendor_rate_from_1_very_dissatisfied_to_10_extremely_satisfied,
    find_vendors = how_do_you_typically_find_childrens_event_vendors_tick_all_that_apply,
    premium_willing = would_you_pay_a_premium_for_a_dedicated_childrens_treat_vendor_with_custom_branding_and_themed_setups,
    frustration = what_is_your_biggest_frustration_or_challenge_when_organizing_treats_for_childrens_events,
    unforgettable = what_would_make_a_childrens_treat_experience_truly_unforgettable_for_your_family
  ) %>%
  # Drop Score column - 100% missing, never activated in Google Forms
  select(-score) %>%
  mutate(
    # Any value containing "2026" is treated as a mis-stored date and recoded
    # to "2".
    # NOTE(review): a date such as 2026-02-03 may originally have been the
    # answer "2-3"; confirm against the form before relying on this value.
    events_per_year = case_when(
      grepl("2026", as.character(events_per_year)) ~ "2",
      TRUE ~ as.character(events_per_year)
    ),
    # The satisfaction score is read in as text; store it as a number so it
    # can be summarised and plotted on a continuous scale.
    satisfaction = parse_number(as.character(satisfaction)),
    # Clean budget column - free text with many different formats
    budget_cleaned = parse_budget(budget_raw)
  ) %>%
  # Encode ordinal variables as numbers for correlation and regression
  mutate(
    income_encoded = encode_income(income),
    treat_pct_encoded = case_when(
      grepl("less than 10|<10", treat_pct, ignore.case = TRUE) ~ 1,
      grepl("10.*20|10-20", treat_pct, ignore.case = TRUE) ~ 2,
      grepl("20.*30|20-30", treat_pct, ignore.case = TRUE) ~ 3,
      grepl("more than 30|>30", treat_pct, ignore.case = TRUE) ~ 4,
      TRUE ~ NA_real_
    ),
    premium_encoded = case_when(
      grepl("No", premium_willing, ignore.case = TRUE) ~ 0,
      grepl("Maybe", premium_willing, ignore.case = TRUE) ~ 1,
      grepl("Yes", premium_willing, ignore.case = TRUE) ~ 2,
      TRUE ~ NA_real_
    ),
    premium_yes = flag_match(premium_willing, "Yes"),
    vendor_used_num = flag_match(vendor_used, "Yes"),
    area_encoded = flag_match(area, "Island|Lekki|Victoria"),
    num_children_encoded = case_when(
      grepl("^1$", num_children) ~ 1,
      grepl("^2$", num_children) ~ 2,
      grepl("^3$", num_children) ~ 3,
      grepl("4|more", num_children, ignore.case = TRUE) ~ 4,
      TRUE ~ NA_real_
    )
  )
# Sanity check: print the row and column counts of the cleaned data
cat("Rows:", dim(df)[1], "\nColumns:", dim(df)[2])
Rows: 100
Columns: 24
*To be written last — after all analyses are complete.*
# 2. Professional Disclosure
I am Adeyinka Adedoyin Obabiolorunkosi, Founder and Chief Experience Officer
of Sweet Indulgence by Hobams, Lagos’s premier kiddies treat experience brand.
My organisation provides premium popcorn, ice cream, popsicles and cotton candy
at children’s parties, school events, corporate family days and community
activations across Lagos. In my role I make daily decisions about market
positioning, pricing strategy, customer targeting and product mix.
The five analytical techniques applied in this study are not academic exercises —
they map directly to decisions I face as a practitioner:
- **EDA** — Understanding what my customers look like and what they want, so I
know which products to stock and which events to target.
- **Visualisation** — Communicating market insights clearly to potential partners,
sponsors and the Hobams Foundation board.
- **Hypothesis Testing** — Testing whether customer behaviour and spending differ
by location, so I can decide whether to offer tiered pricing across Lagos.
- **Correlation Analysis** — Identifying which customer characteristics move
together, so I can build a clearer profile of my ideal premium customer.
- **Logistic Regression** — Predicting which parents are most likely to pay a
premium for branded treat experiences, so I can prioritise the right households
in my sales and marketing outreach.
# 3. Data Collection & Sampling
## Source & Collection Method
The dataset was collected via a structured Google Forms survey designed and
administered by the researcher in her capacity as Founder of Sweet Indulgence
by Hobams. The survey was distributed through WhatsApp groups, Instagram direct
messages and Instagram Stories — the same channels used to market the brand —
between June and December 2025.
## Sampling Frame
The target population is Lagos-based parents with at least one child who
organise or attend children’s events. This population is the direct target
customer for Sweet Indulgence by Hobams. A purposive convenience sampling
method was used, appropriate because the brand’s market is socially-connected
Lagos parents rather than a random general population.
## Sample Size & Statistical Rationale
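A total of 100 responses were collected across 17 variables; some items have
missing values (see Table 1). By the Central Limit Theorem, n ≥ 30 is generally
sufficient for sampling distributions of means to be approximately normal, which
supports the planned t-tests. A sample of n = 100 is also workable for the
chi-squared tests, correlation analyses and logistic regression planned in this
study, although the effective sample size is smaller for any analysis that uses
a variable with missing values.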
## Ethical Notes
No personally identifiable information was collected. Participation was
voluntary. Respondents were informed the data would be used for academic and
business research purposes. No formal ethics board approval was required as the
survey collected no sensitive personal data.
## Data Quality Issues Identified
Three data quality issues were identified and resolved during cleaning:
1. **Budget variable**: collected as free text, producing 61 unique
non-standardised formats (e.g. “500,000”, “500k”, “1.5m”). Resolved using
`parse_number()` with manual override for edge cases.
2. **Events per year**: Google Forms misread numeric input “2” as a date
(“2026-02-03”). Resolved by detecting and recoding date-formatted entries.
3. **Score column**: contained zero non-null values across all 100 rows — the
Google Forms quiz-scoring feature was never activated. Column excluded from
all analyses.
# 4. Data Description
*EDA code and output go here — next step*
# 5. Technique 1 — Exploratory Data Analysis
Exploratory Data Analysis (EDA) is the foundation of any rigorous analytical workflow. Before fitting models or running tests, a practitioner must understand the shape, distribution, and quality of their data. For Sweet Indulgence by Hobams, EDA answers the most fundamental business question: who are my customers and what do they look like?
# Summary statistics for the encoded and cleaned analysis variables
summary(
  select(
    df,
    num_children_encoded, income_encoded, treat_pct_encoded,
    premium_encoded, vendor_used_num, area_encoded,
    satisfaction, budget_cleaned
  )
)
num_children_encoded income_encoded treat_pct_encoded premium_encoded
Min. :1.0 Min. :1.000 Min. :1.00 Min. :0.00
1st Qu.:2.0 1st Qu.:1.750 1st Qu.:2.75 1st Qu.:1.00
Median :2.0 Median :4.000 Median :4.00 Median :1.00
Mean :2.3 Mean :3.222 Mean :3.22 Mean :1.15
3rd Qu.:3.0 3rd Qu.:4.000 3rd Qu.:4.00 3rd Qu.:2.00
Max. :4.0 Max. :4.000 Max. :4.00 Max. :2.00
NA's :46
vendor_used_num area_encoded satisfaction budget_cleaned
Min. :0.00 Min. :0.00 Length:100 Min. : 1
1st Qu.:0.00 1st Qu.:0.00 Class :character 1st Qu.: 30000
Median :1.00 Median :1.00 Mode :character Median : 150000
Mean :0.66 Mean :0.56 Mean : 367117
3rd Qu.:1.00 3rd Qu.:1.00 3rd Qu.: 500000
Max. :1.00 Max. :1.00 Max. :3000000
NA's :11
# Missing value analysis: one row per column that has at least one NA,
# with the NA count and its share of all rows, largest first
missing_summary <- df %>%
  summarise(across(everything(), function(col) sum(is.na(col)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable",
    values_to = "Missing_Count"
  ) %>%
  filter(Missing_Count > 0) %>%
  mutate(Missing_Pct = round(Missing_Count / nrow(df) * 100, 1)) %>%
  arrange(desc(Missing_Count))

kable_styling(
  kable(missing_summary, caption = "Table 1: Missing Values by Variable"),
  bootstrap_options = c("striped", "hover")
)
| Variable | Missing_Count | Missing_Pct |
|---|---|---|
| income_encoded | 46 | 46 |
| satisfaction | 17 | 17 |
| events_per_year | 14 | 14 |
| budget_cleaned | 11 | 11 |
| frustration | 2 | 2 |
| budget_raw | 1 | 1 |
# Distribution of key categorical variables

# Horizontal bar chart of how often each value of `var` occurs, bars ordered
# by count. When `wrap_width` is given, long labels are wrapped to that width.
count_bar_chart <- function(data, var, title, x_label, wrap_width = NULL) {
  tallies <- count(data, {{ var }})
  if (!is.null(wrap_width)) {
    tallies <- mutate(tallies, {{ var }} := str_wrap({{ var }}, wrap_width))
  }
  ggplot(tallies, aes(x = reorder({{ var }}, n), y = n, fill = {{ var }})) +
    geom_col(show.legend = FALSE) +
    coord_flip() +
    labs(title = title, x = x_label, y = "Count") +
    theme_minimal()
}

p1 <- count_bar_chart(
  df, income,
  title = "Income Distribution of Respondents",
  x_label = "Income Range",
  wrap_width = 20
)
p2 <- count_bar_chart(
  df, area,
  title = "Respondents by Lagos Area",
  x_label = "Area",
  wrap_width = 20
)
p3 <- count_bar_chart(
  df, events_per_year,
  title = "Events Hosted Per Year",
  x_label = "Events Per Year"
)

# Histogram of the cleaned budget, rows without a budget excluded
budget_known <- filter(df, !is.na(budget_cleaned))
p4 <- ggplot(budget_known, aes(x = budget_cleaned)) +
  geom_histogram(bins = 20, fill = "#2C7BB6", colour = "white") +
  scale_x_continuous(labels = comma) +
  labs(
    title = "Distribution of Event Budget (₦)",
    x = "Budget (₦)",
    y = "Count"
  ) +
  theme_minimal()

# Arrange the four plots in a 2 x 2 grid
library(patchwork)
(p1 + p2) / (p3 + p4)
# Outlier detection using boxplot on budget and satisfaction
# Boxplot of the cleaned event budget, rows without a budget excluded
p5 <- df %>%
  filter(!is.na(budget_cleaned)) %>%
  ggplot(aes(y = budget_cleaned)) +
  geom_boxplot(fill = "#FDB863") +
  scale_y_continuous(labels = comma) +
  labs(title = "Boxplot: Event Budget (₦)",
       y = "Budget (₦)") +
  theme_minimal()

# Boxplot of the vendor satisfaction score.
# The summary output shows satisfaction stored as character, so it is parsed
# to a number here; a boxplot needs a continuous y value. The as.character()
# call keeps this working if the column is already numeric.
p6 <- df %>%
  mutate(satisfaction = parse_number(as.character(satisfaction))) %>%
  filter(!is.na(satisfaction)) %>%
  ggplot(aes(y = satisfaction)) +
  geom_boxplot(fill = "#74ADD1") +
  labs(title = "Boxplot: Vendor Satisfaction Score",
       y = "Satisfaction (1-10)") +
  theme_minimal()

# Show both boxplots side by side
p5 + p6
Key EDA Findings:
# 6. Technique 2 — Data Visualisation
Effective data visualisation transforms numbers into decisions. For Sweet Indulgence by Hobams, the goal of this visualisation narrative is to answer one central business question: who is my customer and what drives their willingness to pay a premium? Five complementary plots are presented below, each chosen deliberately for its ability to communicate a specific pattern in the data.
# Plot 1 - Treat type popularity
# treat_types is a multi-select answer: split it on commas into one row per
# mention, trim the pieces, then count mentions per treat type.
treat_mentions <- df %>%
  filter(!is.na(treat_types)) %>%
  separate_rows(treat_types, sep = ",") %>%
  mutate(treat_types = str_trim(treat_types)) %>%
  count(treat_types, sort = TRUE)

ggplot(
  treat_mentions,
  aes(x = reorder(treat_types, n), y = n, fill = treat_types)
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Figure 1: Most Popular Treat Types at Children's Events",
    subtitle = "Multiple selections allowed — n = 100 respondents",
    x = "Treat Type",
    y = "Number of Mentions",
    caption = "Source: Sweet Indulgence by Hobams Customer Survey, 2025"
  ) +
  theme_minimal(base_size = 12)
Business interpretation (Figure 1): Ice cream and popcorn dominate treat preferences among Lagos parents. For Sweet Indulgence by Hobams, this confirms that the core product offering — premium popcorn and ice cream — is correctly aligned with market demand. Cotton candy and popsicles serve as complementary upsell products.
# Plot 2 - Income group vs premium willingness (stacked bar)
# Each bar shows the percentage split of premium-willingness answers within
# one income bracket.
df %>%
  filter(!is.na(income), !is.na(premium_willing)) %>%
  mutate(
    income = str_wrap(income, 25),
    # Order the answers by their ordinal code (No < Maybe < Yes) so the
    # red-yellow-green palette runs from "No" to "Yes". With the default
    # alphabetical order "Maybe" would be drawn in red.
    premium_willing = reorder(premium_willing, premium_encoded)
  ) %>%
  count(income, premium_willing) %>%
  group_by(income) %>%
  mutate(pct = n / sum(n) * 100) %>%
  ungroup() %>%
  ggplot(aes(x = income, y = pct, fill = premium_willing)) +
  geom_col(position = "stack") +
  coord_flip() +
  scale_fill_brewer(palette = "RdYlGn", direction = 1) +
  labs(
    title = "Figure 2: Premium Willingness by Income Group",
    subtitle = "Percentage within each income bracket",
    x = "Income Range",
    y = "Percentage (%)",
    fill = "Premium Willingness",
    caption = "Source: Sweet Indulgence by Hobams Customer Survey, 2025"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")
Business interpretation (Figure 2): Higher income brackets show a progressively larger share of “Yes, definitely” responses. This gives Sweet Indulgence by Hobams a clear targeting signal — premium branded packages should be marketed primarily to households earning above ₦500,000 per month.
# Plot 3 - Vendor satisfaction by area (boxplot)
# The summary output shows satisfaction stored as character, so it is parsed
# to a number first; the median ordering and the boxplot both need a numeric
# score.
satisfaction_by_area <- df %>%
  mutate(satisfaction = parse_number(as.character(satisfaction))) %>%
  filter(!is.na(satisfaction), !is.na(area)) %>%
  mutate(area = str_wrap(area, 20))

# Count the respondents actually plotted rather than hard-coding a figure.
# The plotted rows are those with a satisfaction rating, which is not the
# same group as those who reported using a vendor.
n_rated <- nrow(satisfaction_by_area)

ggplot(
  satisfaction_by_area,
  aes(x = reorder(area, satisfaction, median), y = satisfaction, fill = area)
) +
  geom_boxplot(show.legend = FALSE, alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Figure 3: Vendor Satisfaction Scores by Lagos Area",
    subtitle = paste0(
      "Among the ", n_rated,
      " respondents who gave a vendor satisfaction rating"
    ),
    x = "Lagos Area",
    y = "Satisfaction Score (1–10)",
    caption = "Source: Sweet Indulgence by Hobams Customer Survey, 2025"
  ) +
  theme_minimal(base_size = 12)
Business interpretation (Figure 3): Satisfaction scores vary across Lagos areas, with some areas showing wider spread and lower medians — indicating underserved markets where current vendor quality is inconsistent. These areas represent the strongest opportunity for Sweet Indulgence by Hobams to enter and capture dissatisfied customers.
# Plot 4 - Treat budget percentage (bar chart)
# Bars are ordered by the ordinal code in treat_pct_encoded instead of a
# hard-coded list of labels, so a label that differs slightly from the
# expected wording (spacing, dash type) is not turned into NA.
df %>%
  filter(!is.na(treat_pct)) %>%
  count(treat_pct, treat_pct_encoded) %>%
  mutate(treat_pct = reorder(treat_pct, treat_pct_encoded)) %>%
  ggplot(aes(x = treat_pct, y = n, fill = treat_pct)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = n), vjust = -0.5, size = 4) +
  labs(
    title = "Figure 4: Treat Spend as Percentage of Event Budget",
    subtitle = "51 of 100 respondents allocate more than 30% of their budget to treats",
    x = "Treat Budget Percentage",
    y = "Number of Respondents",
    caption = "Source: Sweet Indulgence by Hobams Customer Survey, 2025"
  ) +
  theme_minimal(base_size = 12)
Business interpretation (Figure 4): The majority of respondents — 51 out of 100 — allocate more than 30% of their event budget to treats. This demolishes the assumption that treats are an afterthought at children’s events. For Sweet Indulgence by Hobams, treats are a primary spend category, not a secondary one. This justifies premium pricing.
# Plot 5 - Budget vs satisfaction scatter
# The summary output shows satisfaction stored as character, so it is parsed
# to a number first; the continuous y axis and the linear trend line both
# need a numeric score.
df %>%
  mutate(satisfaction = parse_number(as.character(satisfaction))) %>%
  filter(!is.na(budget_cleaned), !is.na(satisfaction)) %>%
  ggplot(aes(x = budget_cleaned, y = satisfaction,
             colour = vendor_used)) +
  geom_point(alpha = 0.7, size = 3) +
  geom_smooth(method = "lm", se = TRUE, colour = "black",
              linetype = "dashed") +
  scale_x_continuous(labels = comma) +
  scale_colour_manual(values = c("Yes" = "#2C7BB6", "No" = "#D7191C")) +
  labs(
    title = "Figure 5: Event Budget vs Vendor Satisfaction Score",
    subtitle = "Coloured by whether a professional vendor was used",
    x = "Total Event Budget (₦)",
    y = "Vendor Satisfaction Score (1–10)",
    colour = "Used Professional Vendor",
    caption = "Source: Sweet Indulgence by Hobams Customer Survey, 2025"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")
Business interpretation (Figure 5): There is a visible positive trend between event budget and vendor satisfaction — higher-spending parents tend to report higher satisfaction with their vendors. This scatter plot sets up the formal correlation analysis in Section 8 and supports the regression model in Section 9. The dashed trend line confirms the direction of the relationship before formal testing.
# 7. Technique 3 — Hypothesis Testing
*Code and analysis go here*
# 8. Technique 4 — Correlation Analysis
*Code and analysis go here*
# 9. Technique 5 — Logistic Regression
*Code and analysis go here*
# 10. Integrated Findings
*To be written after all five techniques are complete*
# 11. Limitations & Further Work
*To be written near the end*
# References
*APA references to be added*
# Appendix: AI Usage Statement
*To be written at the end*