This project analyses customer purchasing behaviour using marketing campaign data. The analysis focuses on customer income, spending patterns, education levels, marital status, and campaign responses through exploratory data visualisation techniques in R.
# Read the marketing campaign CSV into `dat`
dat <- read_csv(file = "marketing_data.csv")
## Rows: 2240 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Education, Marital_Status, Country
## dbl (24): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, Mnt...
## date (1): Dt_Customer
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the imported data
head(dat, n = 6L)
## # A tibble: 6 × 28
## ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <date>
## 1 1826 1970 Graduation Divorced 84835 0 0 2014-06-16
## 2 1 1961 Graduation Single 57091 0 0 2014-06-15
## 3 10476 1958 Graduation Married 67267 0 1 2014-05-13
## 4 1386 1967 Graduation Together 32474 1 1 2014-05-11
## 5 5371 1989 Graduation Single 21474 1 0 2014-04-08
## 6 7348 1958 PhD Single 71691 0 0 2014-03-17
## # ℹ 20 more variables: Recency <dbl>, MntWines <dbl>, MntFruits <dbl>,
## # MntMeatProducts <dbl>, MntFishProducts <dbl>, MntSweetProducts <dbl>,
## # MntGoldProds <dbl>, NumDealsPurchases <dbl>, NumWebPurchases <dbl>,
## # NumCatalogPurchases <dbl>, NumStorePurchases <dbl>,
## # NumWebVisitsMonth <dbl>, AcceptedCmp3 <dbl>, AcceptedCmp4 <dbl>,
## # AcceptedCmp5 <dbl>, AcceptedCmp1 <dbl>, AcceptedCmp2 <dbl>, Response <dbl>,
## # Complain <dbl>, Country <chr>
# Scatter plot of wine spending against income, with a straight-line trend.
# coord_cartesian() only zooms the visible window, so the trend line is still
# fitted to the rows that fall outside the displayed limits.
dat %>%
  ggplot(mapping = aes(x = Income, y = MntWines)) +
  geom_point(alpha = 0.6, colour = "darkblue") +
  geom_smooth(method = "lm", se = FALSE, colour = "red", linewidth = 1) +
  coord_cartesian(xlim = c(0, 120000), ylim = c(0, 1600)) +
  theme_minimal() +
  labs(
    title = "Relationship Between Customer Income and Wine Spending",
    x = "Customer Income",
    y = "Wine Spending Amount"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Box plot of income for each education level.
# The education labels are recoded once and the resulting column feeds both
# the x axis and the fill colour, so the two mappings cannot drift apart.
dat %>%
  mutate(
    education_label = forcats::fct_recode(
      Education,
      "Secondary" = "2n Cycle",
      "Basic" = "Basic",
      "Graduate" = "Graduation",
      "Master's" = "Master",
      "PhD" = "PhD"
    )
  ) %>%
  ggplot(
    aes(
      x = education_label,
      y = Income,
      fill = education_label
    )
  ) +
  geom_boxplot() +
  labs(
    title = "Customer Income Distribution by Education Level",
    x = "Education Level",
    y = "Customer Income"
  ) +
  theme_minimal() +
  # The fill legend would only repeat the x-axis labels, so it is hidden
  theme(legend.position = "none") +
  # Zoom the income axis without dropping rows from the box statistics
  coord_cartesian(ylim = c(0, 120000))
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Mean spend per product category, sorted from highest to lowest.
# The labels in `Category` line up positionally with the column names below.
product_spending <- tibble(
  Category = c("Wine", "Fruits", "Meat", "Fish", "Sweets", "Gold Products"),
  Average_Spending = vapply(
    c(
      "MntWines",
      "MntFruits",
      "MntMeatProducts",
      "MntFishProducts",
      "MntSweetProducts",
      "MntGoldProds"
    ),
    function(column) mean(dat[[column]], na.rm = TRUE),
    numeric(1),
    USE.NAMES = FALSE
  )
) %>%
  arrange(desc(Average_Spending))

# Horizontal bar chart; reorder() places the bars by average spend
product_spending %>%
  ggplot(
    aes(
      x = reorder(Category, Average_Spending),
      y = Average_Spending,
      fill = Category
    )
  ) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    plot.margin = margin(t = 10, r = 20, b = 10, l = 30),
    legend.position = "none"
  ) +
  labs(
    title = "Average Customer Spending Across Product Categories",
    x = "Product Category",
    y = "Average Customer Spending"
  )
# Histogram of customer income using 40 bins.
# The x axis is zoomed to 0-120000 with coord_cartesian(), which changes the
# visible window only and leaves the binning untouched.
dat %>%
  ggplot(mapping = aes(x = Income)) +
  geom_histogram(
    bins = 40,
    colour = "white",
    fill = "steelblue",
    linewidth = 0.3
  ) +
  coord_cartesian(xlim = c(0, 120000)) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    title = "Distribution of Customer Income Levels",
    x = "Customer Income",
    y = "Number of Customers"
  )
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Number of customers per enrollment month: each Dt_Customer is floored to the
# first day of its month and the rows are counted per month.
customer_trend <- count(
  dat,
  Month = floor_date(Dt_Customer, unit = "month")
)

# Line chart of the monthly counts
customer_trend %>%
  ggplot(mapping = aes(x = Month, y = n)) +
  geom_line(linewidth = 1.2, colour = "darkgreen") +
  theme_minimal() +
  labs(
    title = "Monthly Customer Enrollment Trend",
    x = "Month",
    y = "Number of Customers"
  )
# Pick the six spending columns, renaming them to presentation labels
spending_data <- dat %>%
  select(
    Wine = MntWines,
    Fruits = MntFruits,
    Meat = MntMeatProducts,
    Fish = MntFishProducts,
    Sweets = MntSweetProducts,
    `Gold Products` = MntGoldProds
  )

# Pairwise correlations between categories, using complete rows only
cor_matrix <- cor(spending_data, use = "complete.obs")

# Long format: one row per category pair, in columns Var1, Var2 and Freq
cor_data <- cor_matrix %>%
  as.table() %>%
  as.data.frame()

# Heatmap with the rounded coefficient printed on every tile
cor_data %>%
  ggplot(mapping = aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile(colour = "white") +
  geom_text(
    mapping = aes(label = round(Freq, 2)),
    size = 4,
    colour = "white"
  ) +
  scale_fill_gradient(low = "steelblue", high = "darkred") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold")
  ) +
  labs(
    title = "Correlation Between Product Spending Categories",
    x = "Product Category",
    y = "Product Category",
    fill = "Correlation"
  )
# Customers per marital status and campaign outcome.
# Only the five listed marital statuses are kept; the 0/1 Response flag is
# turned into a labelled factor before counting.
response_data <- dat %>%
  filter(
    Marital_Status %in%
      c("Married", "Single", "Together", "Divorced", "Widow")
  ) %>%
  mutate(
    Response = factor(
      Response,
      levels = c(0, 1),
      labels = c("No Response", "Accepted")
    )
  ) %>%
  count(Marital_Status, Response, name = "Count")

# Side-by-side bars: one bar per outcome within each marital status
response_data %>%
  ggplot(mapping = aes(x = Marital_Status, y = Count, fill = Response)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("steelblue", "darkorange")) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 10, hjust = 1),
    plot.title = element_text(face = "bold")
  ) +
  labs(
    title = "Campaign Response Across Marital Status Groups",
    x = "Marital Status",
    y = "Number of Customers",
    fill = "Campaign Outcome"
  )
# Total spend per customer: the six product-category amounts added together,
# column by column in the order listed
dat$total_spending <- Reduce(
  `+`,
  dat[c(
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds"
  )]
)

# Presentation labels for the education categories
dat$Education_Level <- dat$Education %>%
  forcats::fct_recode(
    "Secondary" = "2n Cycle",
    "Basic" = "Basic",
    "Graduate" = "Graduation",
    "Master's" = "Master",
    "PhD" = "PhD"
  )

# Static scatter of income against total spend, coloured by education level
interactive_plot <- dat %>%
  ggplot(
    mapping = aes(
      x = Income,
      y = total_spending,
      colour = Education_Level
    )
  ) +
  geom_point(size = 2, alpha = 0.6) +
  coord_cartesian(xlim = c(0, 120000)) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    title = "Income vs Total Customer Spending",
    x = "Customer Income",
    y = "Total Spending Amount",
    color = "Education Level"
  )

# Convert to an interactive plotly widget with x, y and colour in the tooltip
ggplotly(
  p = interactive_plot,
  tooltip = c("x", "y", "color")
)