df <- readr::read_csv("marketing.csv") %>%
janitor::clean_names()
# Parse and engineer
df <- df %>%
mutate(dt_customer = ymd(dt_customer),
age = 2025 - year_birth,
total_spend = mnt_wines + mnt_fruits + mnt_meat_products + mnt_fish_products + mnt_sweet_products + mnt_gold_prods,
total_purchases = num_deals_purchases + num_web_purchases + num_catalog_purchases + num_store_purchases)
glimpse(df)
## Rows: 2,240
## Columns: 22
## $ id <dbl> 1826, 1, 10476, 1386, 5371, 7348, 4073, 1991, 40…
## $ year_birth <dbl> 1970, 1961, 1958, 1967, 1989, 1958, 1954, 1967, …
## $ education <chr> "Graduation", "Graduation", "Graduation", "Gradu…
## $ marital_status <chr> "Divorced", "Single", "Married", "Together", "Si…
## $ income <chr> "$84,835.00", "$57,091.00", "$67,267.00", "$32,4…
## $ kidhome <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ dt_customer <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ mnt_wines <dbl> 189, 464, 134, 10, 6, 336, 769, 78, 384, 384, 45…
## $ mnt_fruits <dbl> 104, 5, 11, 0, 16, 130, 80, 0, 0, 0, 26, 4, 82, …
## $ mnt_meat_products <dbl> 379, 64, 59, 1, 24, 411, 252, 11, 102, 102, 535,…
## $ mnt_fish_products <dbl> 111, 7, 15, 0, 11, 240, 15, 0, 21, 21, 73, 0, 80…
## $ mnt_sweet_products <dbl> 189, 0, 2, 0, 0, 32, 34, 0, 32, 32, 98, 13, 20, …
## $ mnt_gold_prods <dbl> 218, 37, 30, 0, 34, 43, 65, 7, 5, 5, 26, 4, 102,…
## $ num_deals_purchases <dbl> 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 1, 2, 1, 1, 0, 4, …
## $ num_web_purchases <dbl> 4, 7, 3, 1, 3, 4, 10, 2, 6, 6, 5, 3, 3, 1, 25, 2…
## $ num_catalog_purchases <dbl> 4, 3, 2, 0, 1, 7, 10, 1, 2, 2, 6, 1, 6, 1, 0, 1,…
## $ num_store_purchases <dbl> 6, 7, 5, 2, 2, 5, 7, 3, 9, 9, 10, 6, 6, 2, 0, 5,…
## $ response <dbl> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ country <chr> "SP", "CA", "US", "AUS", "SP", "SP", "GER", "SP"…
## $ age <dbl> 55, 64, 67, 58, 36, 67, 71, 58, 71, 71, 78, 46, …
## $ total_spend <dbl> 1190, 577, 251, 11, 91, 1192, 1215, 96, 544, 544…
## $ total_purchases <dbl> 15, 18, 11, 4, 8, 17, 28, 7, 20, 20, 22, 12, 16,…
summary(select(df, where(is.numeric)))
## id year_birth kidhome mnt_wines
## Min. : 0 Min. :1893 Min. :0.0000 Min. : 0.00
## 1st Qu.: 2828 1st Qu.:1959 1st Qu.:0.0000 1st Qu.: 23.75
## Median : 5458 Median :1970 Median :0.0000 Median : 173.50
## Mean : 5592 Mean :1969 Mean :0.4442 Mean : 303.94
## 3rd Qu.: 8428 3rd Qu.:1977 3rd Qu.:1.0000 3rd Qu.: 504.25
## Max. :11191 Max. :1996 Max. :2.0000 Max. :1493.00
## mnt_fruits mnt_meat_products mnt_fish_products mnt_sweet_products
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.0 1st Qu.: 16.0 1st Qu.: 3.00 1st Qu.: 1.00
## Median : 8.0 Median : 67.0 Median : 12.00 Median : 8.00
## Mean : 26.3 Mean : 166.9 Mean : 37.53 Mean : 27.06
## 3rd Qu.: 33.0 3rd Qu.: 232.0 3rd Qu.: 50.00 3rd Qu.: 33.00
## Max. :199.0 Max. :1725.0 Max. :259.00 Max. :263.00
## mnt_gold_prods num_deals_purchases num_web_purchases num_catalog_purchases
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 9.00 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 0.000
## Median : 24.00 Median : 2.000 Median : 4.000 Median : 2.000
## Mean : 44.02 Mean : 2.325 Mean : 4.085 Mean : 2.662
## 3rd Qu.: 56.00 3rd Qu.: 3.000 3rd Qu.: 6.000 3rd Qu.: 4.000
## Max. :362.00 Max. :15.000 Max. :27.000 Max. :28.000
## num_store_purchases response age total_spend
## Min. : 0.00 Min. :0.0000 Min. : 29.00 Min. : 5.00
## 1st Qu.: 3.00 1st Qu.:0.0000 1st Qu.: 48.00 1st Qu.: 68.75
## Median : 5.00 Median :0.0000 Median : 55.00 Median : 396.00
## Mean : 5.79 Mean :0.1491 Mean : 56.19 Mean : 605.80
## 3rd Qu.: 8.00 3rd Qu.:0.0000 3rd Qu.: 66.00 3rd Qu.:1045.50
## Max. :13.00 Max. :1.0000 Max. :132.00 Max. :2525.00
## total_purchases
## Min. : 0.00
## 1st Qu.: 8.00
## Median :15.00
## Mean :14.86
## 3rd Qu.:21.00
## Max. :44.00
df %>% summarise(across(everything(), ~ sum(is.na(.)), .names = "na_{.col}")) %>% pivot_longer(everything())
df %>%
summarise(across(c(mnt_wines,mnt_fruits,mnt_meat_products,mnt_fish_products,mnt_sweet_products,mnt_gold_prods), mean, na.rm = TRUE)) %>%
pivot_longer(everything(), names_to = "category", values_to = "avg_spend") %>%
ggplot(aes(x = fct_reorder(category, avg_spend, .desc = TRUE), y = avg_spend)) +
geom_col() +
coord_flip() +
labs(title = "Average spend by product category", x = "Category", y = "Average spend")
df %>%
group_by(education) %>%
summarise(response_rate = mean(response, na.rm = TRUE)) %>%
ggplot(aes(x = fct_reorder(education, response_rate), y = response_rate)) +
geom_col() +
coord_flip() +
labs(title = "Response rate by education", x = "Education", y = "Response rate")
df %>%
group_by(marital_status) %>%
summarise(response_rate = mean(response, na.rm = TRUE)) %>%
ggplot(aes(x = fct_reorder(marital_status, response_rate), y = response_rate)) +
geom_col() +
coord_flip() +
labs(title = "Response rate by marital status", x = "Marital status", y = "Response rate")
df %>%
ggplot(aes(x = income, y = total_spend)) +
geom_point(alpha = 0.6) +
labs(title = "Income vs Total Spend")
df_numeric <- df %>% select(where(is.numeric))
corr_mat <- cor(df_numeric, use = "pairwise.complete.obs")
sort(corr_mat["response", ], decreasing = TRUE)
## response total_spend mnt_wines
## 1.000000000 0.265298009 0.247254499
## mnt_meat_products num_catalog_purchases total_purchases
## 0.236335120 0.220810419 0.155138096
## num_web_purchases mnt_gold_prods mnt_fruits
## 0.148729585 0.139850137 0.125288808
## mnt_sweet_products mnt_fish_products num_store_purchases
## 0.117371901 0.111330795 0.039363444
## year_birth num_deals_purchases age
## 0.021325214 0.002238313 -0.021325214
## id kidhome
## -0.021967818 -0.080007782
df %>%
group_by(country) %>%
summarise(n = n(),
mean_income = mean(income, na.rm = TRUE),
mean_spend = mean(total_spend, na.rm = TRUE),
response_rate = mean(response, na.rm = TRUE)) %>%
arrange(desc(n))
Introduction The goal of this analysis is to profile customers and identify factors associated with campaign response. I analyzed a marketing dataset (n=2,240) containing demographics, spending by product category, purchase channels, and a binary response variable.
Method After cleaning column names and dates, I engineered three features: Age (2025 − Year_Birth), TotalSpend (sum of category spends), and TotalPurchases (sum of channel purchases). I cleaned Income by removing currency symbols and commas, then converted to numeric (24 missing). I summarized missingness, computed descriptive statistics, created histograms/boxplots/bar charts, and calculated correlations (numeric) and grouped response rates (Education, Marital Status, Country).
Results Mean age is ~56.2 (median 55). Income (cleaned) averages ~$52.2k (median ~$51.4k) with notable outliers (max ~$666k). Spending is right-skewed (mean TotalSpend ~$606; median 396). Average category spend is dominated by Wines (≈50%) and Meat (≈28%). The overall response rate is ~14.9%. The strongest positive associations with response are TotalSpend (r≈0.27), Wines (r≈0.25), Meat (r≈0.24), and Catalog Purchases (r≈0.22); Kidhome relates weakly and negatively (r≈−0.08). Higher observed response appears in PhD/Master groups and among Single/Divorced/Widow segments; country differences are present, with Spain and South Africa slightly higher. Some categories (e.g., “YOLO/Absurd”, “ME”) have tiny counts and should be disregarded.
Conclusion Response is most closely linked to value and channel: bigger spenders and catalog shoppers are more likely to respond. Marketing should focus on high Wine/Meat spenders and customers with catalog purchase history, while segment tests by education bands may offer incremental lift. Prior to modeling, I recommend handling income outliers and considering feature scaling or winsorization.