Load data

df <- readr::read_csv("marketing.csv") %>%
  janitor::clean_names()

# Parse and engineer
df <- df %>%
  mutate(dt_customer = ymd(dt_customer),
         age = 2025 - year_birth,
         total_spend = mnt_wines + mnt_fruits + mnt_meat_products + mnt_fish_products + mnt_sweet_products + mnt_gold_prods,
         total_purchases = num_deals_purchases + num_web_purchases + num_catalog_purchases + num_store_purchases)

Overview

glimpse(df)

## Rows: 2,240
## Columns: 22
## $ id                    <dbl> 1826, 1, 10476, 1386, 5371, 7348, 4073, 1991, 40…
## $ year_birth            <dbl> 1970, 1961, 1958, 1967, 1989, 1958, 1954, 1967, …
## $ education             <chr> "Graduation", "Graduation", "Graduation", "Gradu…
## $ marital_status        <chr> "Divorced", "Single", "Married", "Together", "Si…
## $ income                <chr> "$84,835.00", "$57,091.00", "$67,267.00", "$32,4…
## $ kidhome               <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ dt_customer           <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ mnt_wines             <dbl> 189, 464, 134, 10, 6, 336, 769, 78, 384, 384, 45…
## $ mnt_fruits            <dbl> 104, 5, 11, 0, 16, 130, 80, 0, 0, 0, 26, 4, 82, …
## $ mnt_meat_products     <dbl> 379, 64, 59, 1, 24, 411, 252, 11, 102, 102, 535,…
## $ mnt_fish_products     <dbl> 111, 7, 15, 0, 11, 240, 15, 0, 21, 21, 73, 0, 80…
## $ mnt_sweet_products    <dbl> 189, 0, 2, 0, 0, 32, 34, 0, 32, 32, 98, 13, 20, …
## $ mnt_gold_prods        <dbl> 218, 37, 30, 0, 34, 43, 65, 7, 5, 5, 26, 4, 102,…
## $ num_deals_purchases   <dbl> 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 1, 2, 1, 1, 0, 4, …
## $ num_web_purchases     <dbl> 4, 7, 3, 1, 3, 4, 10, 2, 6, 6, 5, 3, 3, 1, 25, 2…
## $ num_catalog_purchases <dbl> 4, 3, 2, 0, 1, 7, 10, 1, 2, 2, 6, 1, 6, 1, 0, 1,…
## $ num_store_purchases   <dbl> 6, 7, 5, 2, 2, 5, 7, 3, 9, 9, 10, 6, 6, 2, 0, 5,…
## $ response              <dbl> 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ country               <chr> "SP", "CA", "US", "AUS", "SP", "SP", "GER", "SP"…
## $ age                   <dbl> 55, 64, 67, 58, 36, 67, 71, 58, 71, 71, 78, 46, …
## $ total_spend           <dbl> 1190, 577, 251, 11, 91, 1192, 1215, 96, 544, 544…
## $ total_purchases       <dbl> 15, 18, 11, 4, 8, 17, 28, 7, 20, 20, 22, 12, 16,…

summary(select(df, where(is.numeric)))

##        id          year_birth      kidhome         mnt_wines      
##  Min.   :    0   Min.   :1893   Min.   :0.0000   Min.   :   0.00  
##  1st Qu.: 2828   1st Qu.:1959   1st Qu.:0.0000   1st Qu.:  23.75  
##  Median : 5458   Median :1970   Median :0.0000   Median : 173.50  
##  Mean   : 5592   Mean   :1969   Mean   :0.4442   Mean   : 303.94  
##  3rd Qu.: 8428   3rd Qu.:1977   3rd Qu.:1.0000   3rd Qu.: 504.25  
##  Max.   :11191   Max.   :1996   Max.   :2.0000   Max.   :1493.00  
##    mnt_fruits    mnt_meat_products mnt_fish_products mnt_sweet_products
##  Min.   :  0.0   Min.   :   0.0    Min.   :  0.00    Min.   :  0.00    
##  1st Qu.:  1.0   1st Qu.:  16.0    1st Qu.:  3.00    1st Qu.:  1.00    
##  Median :  8.0   Median :  67.0    Median : 12.00    Median :  8.00    
##  Mean   : 26.3   Mean   : 166.9    Mean   : 37.53    Mean   : 27.06    
##  3rd Qu.: 33.0   3rd Qu.: 232.0    3rd Qu.: 50.00    3rd Qu.: 33.00    
##  Max.   :199.0   Max.   :1725.0    Max.   :259.00    Max.   :263.00    
##  mnt_gold_prods   num_deals_purchases num_web_purchases num_catalog_purchases
##  Min.   :  0.00   Min.   : 0.000      Min.   : 0.000    Min.   : 0.000       
##  1st Qu.:  9.00   1st Qu.: 1.000      1st Qu.: 2.000    1st Qu.: 0.000       
##  Median : 24.00   Median : 2.000      Median : 4.000    Median : 2.000       
##  Mean   : 44.02   Mean   : 2.325      Mean   : 4.085    Mean   : 2.662       
##  3rd Qu.: 56.00   3rd Qu.: 3.000      3rd Qu.: 6.000    3rd Qu.: 4.000       
##  Max.   :362.00   Max.   :15.000      Max.   :27.000    Max.   :28.000       
##  num_store_purchases    response           age          total_spend     
##  Min.   : 0.00       Min.   :0.0000   Min.   : 29.00   Min.   :   5.00  
##  1st Qu.: 3.00       1st Qu.:0.0000   1st Qu.: 48.00   1st Qu.:  68.75  
##  Median : 5.00       Median :0.0000   Median : 55.00   Median : 396.00  
##  Mean   : 5.79       Mean   :0.1491   Mean   : 56.19   Mean   : 605.80  
##  3rd Qu.: 8.00       3rd Qu.:0.0000   3rd Qu.: 66.00   3rd Qu.:1045.50  
##  Max.   :13.00       Max.   :1.0000   Max.   :132.00   Max.   :2525.00  
##  total_purchases
##  Min.   : 0.00  
##  1st Qu.: 8.00  
##  Median :15.00  
##  Mean   :14.86  
##  3rd Qu.:21.00  
##  Max.   :44.00

df %>% summarise(across(everything(), ~ sum(is.na(.)), .names = "na_{.col}")) %>% pivot_longer(everything())

Product category spend

df %>%
  summarise(across(c(mnt_wines,mnt_fruits,mnt_meat_products,mnt_fish_products,mnt_sweet_products,mnt_gold_prods), mean, na.rm = TRUE)) %>%
  pivot_longer(everything(), names_to = "category", values_to = "avg_spend") %>%
  ggplot(aes(x = fct_reorder(category, avg_spend, .desc = TRUE), y = avg_spend)) +
  geom_col() +
  coord_flip() +
  labs(title = "Average spend by product category", x = "Category", y = "Average spend")

Response rate by education & marital status

df %>%
  group_by(education) %>%
  summarise(response_rate = mean(response, na.rm = TRUE)) %>%
  ggplot(aes(x = fct_reorder(education, response_rate), y = response_rate)) +
  geom_col() +
  coord_flip() +
  labs(title = "Response rate by education", x = "Education", y = "Response rate")

df %>%
  group_by(marital_status) %>%
  summarise(response_rate = mean(response, na.rm = TRUE)) %>%
  ggplot(aes(x = fct_reorder(marital_status, response_rate), y = response_rate)) +
  geom_col() +
  coord_flip() +
  labs(title = "Response rate by marital status", x = "Marital status", y = "Response rate")

Income vs. Total Spend

df %>%
  ggplot(aes(x = income, y = total_spend)) +
  geom_point(alpha = 0.6) +
  labs(title = "Income vs Total Spend")

Correlations

df_numeric <- df %>% select(where(is.numeric))
corr_mat <- cor(df_numeric, use = "pairwise.complete.obs")
sort(corr_mat["response", ], decreasing = TRUE)

##              response           total_spend             mnt_wines 
##           1.000000000           0.265298009           0.247254499 
##     mnt_meat_products num_catalog_purchases       total_purchases 
##           0.236335120           0.220810419           0.155138096 
##     num_web_purchases        mnt_gold_prods            mnt_fruits 
##           0.148729585           0.139850137           0.125288808 
##    mnt_sweet_products     mnt_fish_products   num_store_purchases 
##           0.117371901           0.111330795           0.039363444 
##            year_birth   num_deals_purchases                   age 
##           0.021325214           0.002238313          -0.021325214 
##                    id               kidhome 
##          -0.021967818          -0.080007782

Country summary

df %>%
  group_by(country) %>%
  summarise(n = n(),
            mean_income = mean(income, na.rm = TRUE),
            mean_spend = mean(total_spend, na.rm = TRUE),
            response_rate = mean(response, na.rm = TRUE)) %>%
  arrange(desc(n))

Notes & Insights

Introduction The goal of this analysis is to profile customers and identify factors associated with campaign response. I analyzed a marketing dataset (n=2,240) containing demographics, spending by product category, purchase channels, and a binary response variable.

Method After cleaning column names and dates, I engineered three features: Age (2025 − Year_Birth), TotalSpend (sum of category spends), and TotalPurchases (sum of channel purchases). I cleaned Income by removing currency symbols and commas, then converted to numeric (24 missing). I summarized missingness, computed descriptive statistics, created histograms/boxplots/bar charts, and calculated correlations (numeric) and grouped response rates (Education, Marital Status, Country).

Results Mean age is ~56.2 (median 55). Income (cleaned) averages ~$52.2k (median ~$51.4k) with notable outliers (max ~$666k). Spending is right-skewed (mean TotalSpend ~$606; median 396). Average category spend is dominated by Wines (≈50%) and Meat (≈28%). The overall response rate is ~14.9%. The strongest positive associations with response are TotalSpend (r≈0.27), Wines (r≈0.25), Meat (r≈0.24), and Catalog Purchases (r≈0.22); Kidhome relates weakly and negatively (r≈−0.08). Higher observed response appears in PhD/Master groups and among Single/Divorced/Widow segments; country differences are present, with Spain and South Africa slightly higher. Some categories (e.g., “YOLO/Absurd”, “ME”) have tiny counts and should be disregarded.

Conclusion Response is most closely linked to value and channel: bigger spenders and catalog shoppers are more likely to respond. Marketing should focus on high Wine/Meat spenders and customers with catalog purchase history, while segment tests by education bands may offer incremental lift. Prior to modeling, I recommend handling income outliers and considering feature scaling or winsorization.

Week 1 – Descriptive Statistics & Visualizations

2025-09-04