# Step 1 Loading Packages and adding adults
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ tibble 3.1.8 ✔ purrr 1.0.1
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.5 ✔ forcats 0.5.2
## Warning: package 'readr' was built under R version 4.2.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(completejourney)
## Warning: package 'completejourney' was built under R version 4.2.3
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
demographics_adults <- demographics %>%
mutate(Adults = as.numeric(gsub(" .*", "", household_comp)))
Now that I have adults added as a numeric value. I will complete the following. Joining sales value and dividing kids count to adults to get a ratio of how many kids are in a household per adult. Then, this is divided by sales value to get a numeric number that shows how willing a family is to spend based upon number of kids in their household. The idea is that a higher number will mean lower spending per transaction and more kids per adult.
library(ggplot2)
library(dplyr)
# Join and process data
sales_adults <- transactions_sample %>%
select(household_id, sales_value) %>%
inner_join(demographics_adults, by = "household_id")
sales_adults <- sales_adults %>%
mutate(
kids_count = ifelse(kids_count == "3+", 3, as.numeric(kids_count)),
Kids_adults = kids_count / Adults,
Kids_Sales_Ratio = Kids_adults / sales_value
)
# Plotting the Correlation
ggplot(sales_adults, aes(x = Kids_Sales_Ratio, fill = income)) +
geom_histogram(binwidth = 0.1, color = "black", alpha = 0.7) +
facet_wrap(~income, scales = "free_y") +
scale_x_continuous(limits = c(0, 3)) +
labs(title = "Distribution of Kids_Sales_Ratio Across Income Levels",
x = "Kids to Sales Ratio",
y = "Frequency",
fill = "Income Level") +
theme_minimal()
## Warning: Removed 1099 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 24 rows containing missing values (`geom_bar()`).
Overall the reflection shown in the charts are intriguing. It indicates most households have a lower ratio indicating fewer kids/higher spending. However, what the chart doesn’t tell me is how the ratios are formed. For example, a lower ratio in the under 15k group is likely due to fewer kids, when 250K+ is likely from higher spending. This leads me into my next question.
sales_adults <- sales_adults %>%
mutate(
income = as.character(income),
income_group = case_when(
income %in% c("Under 15K", "15-24K", "25-34K", "35-49K") ~ "Under 15K - 35-49K",
income %in% c("50-74K", "75-99K", "100-124K", "125-149K") ~ "50-74K - 125-149K",
income %in% c("150-174K", "175-199K", "200-249K") ~ "150-174K - 200-249K",
income %in% c("250K+") ~ "250K+",
TRUE ~ NA_character_
)
)
ggplot(sales_adults, aes(x = factor(kids_count), fill = income_group)) +
geom_bar(position = "dodge", color = "black", alpha = 0.8) +
facet_wrap(~ income_group, scales = "free_y") +
scale_fill_brewer(palette = "Set2") +
labs(title = "Kids Count Distribution Across Income Groups",
subtitle = "Comparison of household sizes across economic levels",
x = "Number of Kids",
y = "Number of Households",
fill = "Income Group") +
theme_minimal() +
theme(
axis.text.x = element_text(size = 12, angle = 0, vjust = 0.5),
axis.text.y = element_text(size = 12),
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(size = 12, face = "italic"),
legend.position = "bottom"
)
This chart shows a confirmation within what I was expecting. It appears each group follows the same correlation of 1 kid being the most, then 2 and 3 except for the range of 50 to 150K. It appears those in the middle class have a higher prioritization on creating a larger family. I believe this is because those in the lower class may not have the financial stability to afford a larger family while the upper class is career focused.
This last visualization I am looking to get a closer look at the specific income distribution among those with 3 kids. I start by filtering my data set to only include those with 3 kids.
sales_kids_3 <- sales_adults %>%
filter(kids_count >= 3)
Now I am going to plot the data to find a correlation
sales_kids_3_summary <- sales_kids_3 %>%
group_by(income_group) %>%
summarise(count = n(), .groups = "drop") %>%
arrange(desc(count))
ggplot(sales_kids_3_summary, aes(x = reorder(income_group, count), y = count, fill = income_group)) +
geom_bar(stat = "identity") +
geom_text(aes(label = count), vjust = -0.3, size = 5) + # Add data labels
scale_fill_manual(values = c("#E74C3C", "#2E86C1", "#28B463", "#9B59B6")) + # Custom colors
labs(
title = "Income Distribution Among Families with 3+ Kids",
x = "Income Group",
y = "Number of Households",
fill = "Income Group"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
This visualization builds upon the conclusion drawn upon plot 2. We clearly see the median income range incorporating the highest number of families with 3+ kids.