Customer Purchasing Behaviour Analysis

Project Overview

This project analyses customer purchasing behaviour using marketing campaign data. The analysis focuses on customer income, spending patterns, education levels, marital status, and campaign responses through exploratory data visualisation techniques in R.

Import Data

# Load the marketing campaign data; the relative path is resolved against the
# current working directory.
dat <- read_csv(file = "marketing_data.csv")

## Rows: 2240 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (3): Education, Marital_Status, Country
## dbl  (24): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, Mnt...
## date  (1): Dt_Customer
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first six rows to sanity-check the import.
head(dat, n = 6)

## # A tibble: 6 × 28
##      ID Year_Birth Education  Marital_Status Income Kidhome Teenhome Dt_Customer
##   <dbl>      <dbl> <chr>      <chr>           <dbl>   <dbl>    <dbl> <date>     
## 1  1826       1970 Graduation Divorced        84835       0        0 2014-06-16 
## 2     1       1961 Graduation Single          57091       0        0 2014-06-15 
## 3 10476       1958 Graduation Married         67267       0        1 2014-05-13 
## 4  1386       1967 Graduation Together        32474       1        1 2014-05-11 
## 5  5371       1989 Graduation Single          21474       1        0 2014-04-08 
## 6  7348       1958 PhD        Single          71691       0        0 2014-03-17 
## # ℹ 20 more variables: Recency <dbl>, MntWines <dbl>, MntFruits <dbl>,
## #   MntMeatProducts <dbl>, MntFishProducts <dbl>, MntSweetProducts <dbl>,
## #   MntGoldProds <dbl>, NumDealsPurchases <dbl>, NumWebPurchases <dbl>,
## #   NumCatalogPurchases <dbl>, NumStorePurchases <dbl>,
## #   NumWebVisitsMonth <dbl>, AcceptedCmp3 <dbl>, AcceptedCmp4 <dbl>,
## #   AcceptedCmp5 <dbl>, AcceptedCmp1 <dbl>, AcceptedCmp2 <dbl>, Response <dbl>,
## #   Complain <dbl>, Country <chr>

Figure 1: Income vs Wine Spending

# Scatter of wine spending against income with a straight-line fit on top.
# coord_cartesian() only zooms the view, so the fit is still computed from
# every row with a finite Income, including any outside the visible window.
ggplot(data = dat, mapping = aes(x = Income, y = MntWines)) +
  geom_point(alpha = 0.6, color = "darkblue") +
  geom_smooth(method = "lm", se = FALSE, color = "red", linewidth = 1) +
  coord_cartesian(xlim = c(0, 120000), ylim = c(0, 1600)) +
  theme_minimal() +
  labs(
    x = "Customer Income",
    y = "Wine Spending Amount",
    title = "Relationship Between Customer Income and Wine Spending"
  )

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).

Figure 2: Income Distribution by Education Level

# Box plots of income for each education level, one fill colour per level.
# The display label is derived once and reused for both the x axis and the
# fill. Only the three levels whose raw names differ from the label are
# renamed; "Basic" and "PhD" are left as they are.
dat %>%
  mutate(
    Education_Label = forcats::fct_recode(
      Education,
      "Secondary" = "2n Cycle",
      "Graduate" = "Graduation",
      "Master's" = "Master"
    )
  ) %>%
  ggplot(aes(x = Education_Label, y = Income, fill = Education_Label)) +
  geom_boxplot() +
  # Zoom the income axis without discarding rows from the box statistics
  coord_cartesian(ylim = c(0, 120000)) +
  theme_minimal() +
  # The fill duplicates the x axis, so the legend adds nothing
  theme(legend.position = "none") +
  labs(
    x = "Education Level",
    y = "Customer Income",
    title = "Customer Income Distribution by Education Level"
  )

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Figure 3: Average Spending by Product Category

# Mean spend per product category (missing values ignored), largest first.
# The category labels and the source columns are listed in the same order.
product_spending <- tibble(
  Category = c("Wine", "Fruits", "Meat", "Fish", "Sweets", "Gold Products"),
  Average_Spending = vapply(
    c(
      "MntWines", "MntFruits", "MntMeatProducts",
      "MntFishProducts", "MntSweetProducts", "MntGoldProds"
    ),
    function(column) mean(dat[[column]], na.rm = TRUE),
    numeric(1),
    USE.NAMES = FALSE
  )
) %>%
  arrange(desc(Average_Spending))

# Horizontal bar chart; reorder() ranks the bars by their average spend.
ggplot(
  data = product_spending,
  mapping = aes(
    x = reorder(Category, Average_Spending),
    y = Average_Spending,
    fill = Category
  )
) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  theme(
    plot.margin = margin(10, 20, 10, 30),
    plot.title = element_text(face = "bold"),
    legend.position = "none"
  ) +
  labs(
    x = "Product Category",
    y = "Average Customer Spending",
    title = "Average Customer Spending Across Product Categories"
  )

Figure 4: Distribution of Customer Income

# Histogram of customer income, zoomed to the 0-120,000 range.
#
# coord_cartesian() only zooms the view: stat_bin() still bins the full range
# of Income. With `bins = 40`, any income far above the visible window widens
# every bin and leaves only a few coarse bars on screen. A fixed bin width
# anchored at zero gives 40 bins across the visible window (120000 / 40 = 3000)
# whatever lies outside it.
ggplot(dat, aes(x = Income)) +
  geom_histogram(
    binwidth = 3000,
    boundary = 0,
    fill = "steelblue",
    color = "white",
    linewidth = 0.3
  ) +
  labs(
    title = "Distribution of Customer Income Levels",
    x = "Customer Income",
    y = "Number of Customers"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  coord_cartesian(xlim = c(0, 120000))

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_bin()`).

Figure 5: Customer Enrollment Over Time

# Number of customers per enrollment month: each enrollment date is rounded
# down to the first day of its month and the rows are tallied per month.
customer_trend <- dat %>%
  count(Month = floor_date(Dt_Customer, unit = "month"))

# Line chart of the monthly tallies.
ggplot(data = customer_trend, mapping = aes(x = Month, y = n)) +
  geom_line(linewidth = 1.2, color = "darkgreen") +
  theme_minimal() +
  labs(
    x = "Month",
    y = "Number of Customers",
    title = "Monthly Customer Enrollment Trend"
  )

Figure 6: Correlation Between Product Spending Categories

# Keep the six spending columns, renaming them to presentation labels as they
# are selected
spending_data <- dat %>%
  select(
    Wine = MntWines,
    Fruits = MntFruits,
    Meat = MntMeatProducts,
    Fish = MntFishProducts,
    Sweets = MntSweetProducts,
    `Gold Products` = MntGoldProds
  )

# Pairwise correlations, computed only on rows with no missing values
cor_matrix <- cor(spending_data, use = "complete.obs")

# Long format: one row per pair of categories (Var1, Var2) with the
# correlation in Freq
cor_data <- as.data.frame(as.table(cor_matrix))

# Heatmap with each tile labelled by its correlation rounded to 2 decimals
ggplot(data = cor_data, mapping = aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(Freq, 2)), size = 4, color = "white") +
  scale_fill_gradient(low = "steelblue", high = "darkred") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold")
  ) +
  labs(
    x = "Product Category",
    y = "Product Category",
    fill = "Correlation",
    title = "Correlation Between Product Spending Categories"
  )

Figure 7: Campaign Response by Marital Status

# Customers per marital status and campaign outcome. Only the five listed
# marital statuses are kept; any other value is dropped before counting.
response_data <- dat %>%
  filter(
    Marital_Status %in% c("Married", "Single", "Together", "Divorced", "Widow")
  ) %>%
  mutate(
    # Turn the 0/1 Response flag into a labelled factor for the legend
    Response = factor(
      Response,
      levels = c(0, 1),
      labels = c("No Response", "Accepted")
    )
  ) %>%
  count(Marital_Status, Response, name = "Count")

# Side-by-side bars: one pair of outcome bars per marital status.
ggplot(
  data = response_data,
  mapping = aes(x = Marital_Status, y = Count, fill = Response)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("steelblue", "darkorange")) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 10, hjust = 1),
    plot.title = element_text(face = "bold")
  ) +
  labs(
    x = "Marital Status",
    y = "Number of Customers",
    fill = "Campaign Outcome",
    title = "Campaign Response Across Marital Status Groups"
  )

Figure 8: Income vs Total Spending

# Add two derived columns to `dat`:
#   total_spending  - sum of the six product spending columns
#   Education_Level - Education with presentation-friendly labels ("Basic" and
#                     "PhD" already read well and are left unchanged)
dat <- dat %>%
  mutate(
    total_spending = MntWines + MntFruits + MntMeatProducts +
      MntFishProducts + MntSweetProducts + MntGoldProds,
    Education_Level = forcats::fct_recode(
      Education,
      "Secondary" = "2n Cycle",
      "Graduate" = "Graduation",
      "Master's" = "Master"
    )
  )

# Static scatter of total spending against income, coloured by education;
# the income axis is zoomed to 0-120,000 without dropping rows
interactive_plot <- ggplot(
  data = dat,
  mapping = aes(x = Income, y = total_spending, color = Education_Level)
) +
  geom_point(size = 2, alpha = 0.6) +
  coord_cartesian(xlim = c(0, 120000)) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    x = "Customer Income",
    y = "Total Spending Amount",
    color = "Education Level",
    title = "Income vs Total Customer Spending"
  )

# Convert to an interactive plotly widget whose hover text shows the x, y and
# colour mappings
ggplotly(interactive_plot, tooltip = c("x", "y", "color"))