This tutorial investigates the connection between customer income and
wine spending using data from a simulated marketing campaign. We analyze
the first 100 entries from a modified version of the “Customer
Personality Analysis” dataset on Kaggle, which contains consumer
demographic information and purchase history. The objective is to assess
whether higher income levels are associated with increased wine
spending, offering insights that could support targeted marketing
efforts.
Link to dataset
# Loading necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Check working directory - the relative file name used to load the data is resolved against it
getwd()
## [1] "/cloud/project"
# Loading data - make sure the file exists in this location.
# Despite its .csv extension the file is tab-separated, so it is read with
# read_tsv(); read_csv() would collapse every field into one character column
# and none of the expected columns (Income, MntWines, ...) would exist.
# NOTE(review): the introduction mentions analysing the first 100 entries; if
# that subset is intended, follow this with slice_head(n = 100) - confirm.
data <- read_tsv("marketing_campaign.csv")
## Rows: 2240 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_C...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Check if data loaded properly and verify column names.
# Expect one name per field; a single name containing "\t" separators means
# the file was parsed with the wrong delimiter.
names(data)
## [1] "ID\tYear_Birth\tEducation\tMarital_Status\tIncome\tKidhome\tTeenhome\tDt_Customer\tRecency\tMntWines\tMntFruits\tMntMeatProducts\tMntFishProducts\tMntSweetProducts\tMntGoldProds\tNumDealsPurchases\tNumWebPurchases\tNumCatalogPurchases\tNumStorePurchases\tNumWebVisitsMonth\tAcceptedCmp3\tAcceptedCmp4\tAcceptedCmp5\tAcceptedCmp1\tAcceptedCmp2\tComplain\tZ_CostContact\tZ_Revenue\tResponse"
# View the first rows to confirm data loaded correctly
head(data)
## # A tibble: 6 × 1
## ID\tYear_Birth\tEducation\tMarital_Status\tIncome\tKidhome\tTeenhome\tDt_Cus…¹
## <chr>
## 1 "5524\t1957\tGraduation\tSingle\t58138\t0\t0\t04-09-2012\t58\t635\t88\t546\t1…
## 2 "2174\t1954\tGraduation\tSingle\t46344\t1\t1\t08-03-2014\t38\t11\t1\t6\t2\t1\…
## 3 "4141\t1965\tGraduation\tTogether\t71613\t0\t0\t21-08-2013\t26\t426\t49\t127\…
## 4 "6182\t1984\tGraduation\tTogether\t26646\t1\t0\t10-02-2014\t26\t11\t4\t20\t10…
## 5 "5324\t1981\tPhD\tMarried\t58293\t1\t0\t19-01-2014\t94\t173\t43\t118\t46\t27\…
## 6 "7446\t1967\tMaster\tTogether\t62513\t0\t1\t09-09-2013\t16\t520\t42\t98\t0\t4…
## # ℹ abbreviated name:
## # ¹`ID\tYear_Birth\tEducation\tMarital_Status\tIncome\tKidhome\tTeenhome\tDt_Customer\tRecency\tMntWines\tMntFruits\tMntMeatProducts\tMntFishProducts\tMntSweetProducts\tMntGoldProds\tNumDealsPurchases\tNumWebPurchases\tNumCatalogPurchases\tNumStorePurchases\tNumWebVisitsMonth\tAcceptedCmp3\tAcceptedCmp4\tAcceptedCmp5\tAcceptedCmp1\tAcceptedCmp2\tComplain\tZ_CostContact\tZ_Revenue\tResponse`
# Get summary statistics for every column
summary(data)
## ID\tYear_Birth\tEducation\tMarital_Status\tIncome\tKidhome\tTeenhome\tDt_Customer\tRecency\tMntWines\tMntFruits\tMntMeatProducts\tMntFishProducts\tMntSweetProducts\tMntGoldProds\tNumDealsPurchases\tNumWebPurchases\tNumCatalogPurchases\tNumStorePurchases\tNumWebVisitsMonth\tAcceptedCmp3\tAcceptedCmp4\tAcceptedCmp5\tAcceptedCmp1\tAcceptedCmp2\tComplain\tZ_CostContact\tZ_Revenue\tResponse
## Length:2240
## Class :character
## Mode :character
# Income histogram (bins 5000 wide); when the column is absent a notice is
# printed instead of a plot.
if (!("Income" %in% names(data))) {
  print("Error: 'Income' column not found in the dataset")
} else {
  ggplot(data, mapping = aes(x = Income)) +
    geom_histogram(binwidth = 5000, fill = "steelblue", color = "black") +
    labs(title = "Income Distribution") +
    theme_minimal()
}
## [1] "Error: 'Income' column not found in the dataset"
# Wine-spending histogram (bins 20 wide); when the column is absent a notice
# is printed instead of a plot.
if (!("MntWines" %in% names(data))) {
  print("Error: 'MntWines' column not found in the dataset")
} else {
  ggplot(data, mapping = aes(x = MntWines)) +
    geom_histogram(binwidth = 20, fill = "seagreen", color = "black") +
    labs(title = "Wine Spending Distribution") +
    theme_minimal()
}
## [1] "Error: 'MntWines' column not found in the dataset"
# Linear regression of wine spending on income - only proceed if both columns
# exist; otherwise a notice is printed and nothing is fitted.
if (all(c("Income", "MntWines") %in% names(data))) {
  model <- lm(MntWines ~ Income, data = data)
  # Inside braces only the value of the last expression is auto-printed, so
  # the model summary has to be printed explicitly or it never appears.
  print(summary(model))
  # Scatter plot with the fitted regression line overlaid. The formula is
  # spelled out (it is the default for method = "lm") to avoid the
  # informational message geom_smooth() emits when it is left implicit.
  ggplot(data, aes(x = Income, y = MntWines)) +
    geom_point() +
    geom_smooth(method = "lm", formula = y ~ x, color = "red", se = FALSE) +
    labs(title = "Income vs Wine Spending", x = "Income", y = "Wine Spending") +
    theme_minimal()
} else {
  print("Error: Required columns not found for regression analysis")
}
## [1] "Error: Required columns not found for regression analysis"