Dataset: Marketing Data
Task: Exploratory and Statistical Analysis Task.
Contents:
library(tidyverse)
library(skimr)
library(lubridate)
library(reshape)
library(Hmisc)
library(corrplot)
library(tibble)
library(viridis)
library(rstatix)
library(wesanderson)
library(ggsci)
data = read.csv("marketing_data.csv",header=TRUE)
dim(data)
[1] 2240 28
str(data)
'data.frame': 2240 obs. of 28 variables:
$ ID : int 1826 1 10476 1386 5371 7348 4073 1991 4047 9477 ...
$ Year_Birth : int 1970 1961 1958 1967 1989 1958 1954 1967 1954 1954 ...
$ Education : chr "Graduation" "Graduation" "Graduation" "Graduation" ...
$ Marital_Status : chr "Divorced" "Single" "Married" "Together" ...
$ Income : chr "$84,835.00 " "$57,091.00 " "$67,267.00 " "$32,474.00 " ...
$ Kidhome : int 0 0 0 1 1 0 0 0 0 0 ...
$ Teenhome : int 0 0 1 1 0 0 0 1 1 1 ...
$ Dt_Customer : chr "6/16/14" "6/15/14" "5/13/14" "5/11/14" ...
$ Recency : int 0 0 0 0 0 0 0 0 0 0 ...
$ MntWines : int 189 464 134 10 6 336 769 78 384 384 ...
$ MntFruits : int 104 5 11 0 16 130 80 0 0 0 ...
$ MntMeatProducts : int 379 64 59 1 24 411 252 11 102 102 ...
$ MntFishProducts : int 111 7 15 0 11 240 15 0 21 21 ...
$ MntSweetProducts : int 189 0 2 0 0 32 34 0 32 32 ...
$ MntGoldProds : int 218 37 30 0 34 43 65 7 5 5 ...
$ NumDealsPurchases : int 1 1 1 1 2 1 1 1 3 3 ...
$ NumWebPurchases : int 4 7 3 1 3 4 10 2 6 6 ...
$ NumCatalogPurchases: int 4 3 2 0 1 7 10 1 2 2 ...
$ NumStorePurchases : int 6 7 5 2 2 5 7 3 9 9 ...
$ NumWebVisitsMonth : int 1 5 2 7 7 2 6 5 4 4 ...
$ AcceptedCmp3 : int 0 0 0 0 1 0 1 0 0 0 ...
$ AcceptedCmp4 : int 0 0 0 0 0 0 0 0 0 0 ...
$ AcceptedCmp5 : int 0 0 0 0 0 0 0 0 0 0 ...
$ AcceptedCmp1 : int 0 0 0 0 0 0 0 0 0 0 ...
$ AcceptedCmp2 : int 0 1 0 0 0 0 0 0 0 0 ...
$ Response : int 1 1 0 0 1 1 1 0 0 0 ...
$ Complain : int 0 0 0 0 0 0 0 0 0 0 ...
$ Country : chr "SP" "CA" "US" "AUS" ...
# Format dates in Dt_Customer
data$Dt_Customer = mdy(data$Dt_Customer)
summary(data$Dt_Customer)
Min. 1st Qu. Median Mean 3rd Qu. Max.
"2012-07-30" "2013-01-16" "2013-07-08" "2013-07-10" "2013-12-30" "2014-06-29"
# Drop special characters in Income
data$Income = gsub('[$]([0-9]+)[,]([0-9]+)','\\1\\2',data$Income)
data$Income = as.numeric(data$Income)
# Missing data analysis
skim(data)
── Data Summary ────────────────────────
Values
Name data
Number of rows 2240
Number of columns 28
_______________________
Column type frequency:
character 3
Date 1
numeric 24
________________________
Group variables None
── Variable type: character ─────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 Education 0 1 3 10 0 5 0
2 Marital_Status 0 1 4 8 0 8 0
3 Country 0 1 2 3 0 8 0
── Variable type: Date ──────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max median n_unique
1 Dt_Customer 0 1 2012-07-30 2014-06-29 2013-07-08 663
── Variable type: numeric ───────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
1 ID 0 1 5592. 3247. 0 2828. 5458. 8428. 11191 ▇▇▇▇▇
2 Year_Birth 0 1 1969. 12.0 1893 1959 1970 1977 1996 ▁▁▂▇▅
3 Income 24 0.989 52247. 25173. 1730 35303 51382. 68522 666666 ▇▁▁▁▁
4 Kidhome 0 1 0.444 0.538 0 0 0 1 2 ▇▁▆▁▁
5 Teenhome 0 1 0.506 0.545 0 0 0 1 2 ▇▁▇▁▁
6 Recency 0 1 49.1 29.0 0 24 49 74 99 ▇▇▇▇▇
7 MntWines 0 1 304. 337. 0 23.8 174. 504. 1493 ▇▂▂▁▁
8 MntFruits 0 1 26.3 39.8 0 1 8 33 199 ▇▁▁▁▁
9 MntMeatProducts 0 1 167. 226. 0 16 67 232 1725 ▇▁▁▁▁
10 MntFishProducts 0 1 37.5 54.6 0 3 12 50 259 ▇▁▁▁▁
11 MntSweetProducts 0 1 27.1 41.3 0 1 8 33 263 ▇▁▁▁▁
12 MntGoldProds 0 1 44.0 52.2 0 9 24 56 362 ▇▁▁▁▁
13 NumDealsPurchases 0 1 2.33 1.93 0 1 2 3 15 ▇▂▁▁▁
14 NumWebPurchases 0 1 4.08 2.78 0 2 4 6 27 ▇▃▁▁▁
15 NumCatalogPurchases 0 1 2.66 2.92 0 0 2 4 28 ▇▂▁▁▁
16 NumStorePurchases 0 1 5.79 3.25 0 3 5 8 13 ▂▇▂▃▂
17 NumWebVisitsMonth 0 1 5.32 2.43 0 3 6 7 20 ▅▇▁▁▁
18 AcceptedCmp3 0 1 0.0728 0.260 0 0 0 0 1 ▇▁▁▁▁
19 AcceptedCmp4 0 1 0.0746 0.263 0 0 0 0 1 ▇▁▁▁▁
20 AcceptedCmp5 0 1 0.0728 0.260 0 0 0 0 1 ▇▁▁▁▁
21 AcceptedCmp1 0 1 0.0643 0.245 0 0 0 0 1 ▇▁▁▁▁
22 AcceptedCmp2 0 1 0.0134 0.115 0 0 0 0 1 ▇▁▁▁▁
23 Response 0 1 0.149 0.356 0 0 0 0 1 ▇▁▁▁▂
24 Complain 0 1 0.00938 0.0964 0 0 0 0 1 ▇▁▁▁▁
# Drop 24 obs without income
data = data %>% filter(!is.na(Income))
dim(data)
[1] 2216 28
# Unique levels in categorical variables
summary(as.factor(data$Education))
2n Cycle Basic Graduation Master PhD
200 54 1116 365 481
summary(as.factor(data$Marital_Status))
Absurd Alone Divorced Married Single Together Widow YOLO
2 3 232 857 471 573 76 2
summary(as.factor(data$Country))
AUS CA GER IND ME SA SP US
147 266 116 147 3 337 1093 107
# Find duplicates
length(unique(data$ID))
[1] 2216
datacopy = data %>% distinct(Income,Year_Birth, Kidhome, Teenhome, Recency, Dt_Customer, MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts, MntGoldProds,NumDealsPurchases, NumWebPurchases, NumCatalogPurchases, NumStorePurchases, NumWebVisitsMonth, .keep_all=TRUE)
dim(datacopy)
[1] 2011 28
# Recode levels after filtering duplicates
library(plyr)
revalue(datacopy$Education,c("2n Cycle" = "Master")) -> datacopy$Education
revalue(datacopy$Marital_Status,c("Alone" = "Single")) -> datacopy$Marital_Status
revalue(datacopy$Marital_Status,c("Absurd" = "Together")) -> datacopy$Marital_Status
revalue(datacopy$Country,c("ME" = "AUS")) -> datacopy$Country
# Summary of unique levels after cleaning
summary(as.factor(datacopy$Education))
Basic Graduation Master PhD
49 1013 514 435
summary(as.factor(datacopy$Marital_Status))
Divorced Married Single Together Widow
212 781 438 511 69
summary(as.factor(datacopy$Country))
AUS CA GER IND SA SP US
130 244 103 132 305 994 103
data = datacopy
# Change variables types
data = data %>% mutate_at(vars(AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Response,Complain),list(factor))
# Outliers
# Scale
numeric_data = data %>% select(where(is.numeric))
z_data = data.frame(scale(numeric_data))
z_data$type = rownames(data)
# Plot all numeric variables
z_data %>%
pivot_longer(cols=-type) %>%
ggplot(aes(x = name, y = value)) +
geom_boxplot() +
theme_classic() +
expand_limits(x=12.6) + coord_flip()
# Plot Income
summary(data$Income)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1730 35534 51563 52375 68656 666666
data %>% ggplot(aes(x=Year_Birth)) + geom_boxplot()
# Drop obs with outliers in Income and Year_Birth
data = data %>% filter(!Year_Birth<1930) %>% filter(!Income==666666)
dim(data)
[1] 2007 28
# Feature Construction
# Dt_Customer days since the epoch
data$Dt_Customer_num = as.numeric(data$Dt_Customer)
summary(data$Dt_Customer_num)
Min. 1st Qu. Median Mean 3rd Qu. Max.
15551 15722 15899 15898 16072 16250
# TotalPurchases
data$TotalPurchases = data$NumWebPurchases + data$NumCatalogPurchases + data$NumStorePurchases + data$NumDealsPurchases
summary(data$TotalPurchases)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 8.00 15.00 14.89 21.00 44.00
# Obs without purchases
NoPurchases = data %>% filter(TotalPurchases==0)
head(NoPurchases)
# TotalProducts
data$TotalProducts = data$MntWines + data$MntFruits + data$MntMeatProducts + data$MntFishProducts + data$MntSweetProducts + data$MntGoldProds
summary(data$TotalProducts)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.0 69.0 397.0 607.8 1047.5 2525.0
# Plot TotalPurchases and TotalProducts
data %>% ggplot(aes(x=TotalPurchases,y=TotalProducts)) + geom_point(alpha=0.5)
# Plot anomalies
data %>% filter(TotalPurchases==0) %>% ggplot(aes(x=TotalPurchases,y=TotalProducts)) + geom_count() + scale_y_continuous(limits=c(0.00,10.00)) + scale_size_continuous(breaks=round)
# Drop anomalies
data = data %>% filter(TotalPurchases!=0)
dim(data)
[1] 2003 31
# Label
data22 = data
data22$country2 = ifelse(data22$Country=="US","US","World")
Hmisc::describe(data22$country2)
data22$country2
n missing distinct
2003 0 2
Value US World
Frequency 103 1900
Proportion 0.051 0.949
# Independent t test
t.test(TotalPurchases ~ country2, data= data22, var.equal=TRUE,alternative="greater")
Two Sample t-test
data: TotalPurchases by country2
t = 1.8535, df = 2001, p-value = 0.03198
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
0.1605314 Inf
sample estimates:
mean in group US mean in group World
16.28155 14.85053
# Label
data23 = data
data23$gold2 = ifelse(data23$MntGoldProds> mean(data23$MntGoldProds),"above_avg","not_above_avg")
Hmisc::describe(data23$gold2)
data23$gold2
n missing distinct
2003 0 2
Value above_avg not_above_avg
Frequency 631 1372
Proportion 0.315 0.685
# Independent t test
t.test(NumStorePurchases ~ gold2, data= data23, var.equal=TRUE,alternative="greater")
Two Sample t-test
data: NumStorePurchases by gold2
t = 19.39, df = 2001, p-value < 2.2e-16
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
2.531203 Inf
sample estimates:
mean in group above_avg mean in group not_above_avg
7.698891 4.932945
# Summary of groups
data25 = data
data25$Response = as.numeric(data25$Response)
data25 %>% group_by(Country) %>% get_summary_stats(Response, type="mean_sd")
# Compare means of group using ANOVA test
res.aov = data25 %>% anova_test(Response ~ Country)
res.aov
# Prepare data
data3 = data %>% select (AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4, AcceptedCmp5)
data3 = mutate_if(data3, is.factor, ~as.numeric(as.character(.x)))
# Plot
data3 %>% gather(Cmp, Outcome, AcceptedCmp1:AcceptedCmp5) %>% filter(Outcome>0) %>% group_by(Cmp) %>% tally() %>% ggplot(aes(x=reorder(Cmp,n), y=n, fill=Cmp)) + geom_col(width=0.7) + geom_text(stat="identity",aes(label=n),hjust=-0.2,size=3.5) + scale_y_continuous(limits=c(0,180)) + coord_flip() + labs(y="", x="",title="Which marketing campaign is the most sucessful?", subtitle ="Number of customers who accepted the offer") + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.x=element_blank())
# Average Year_Birth and Income
data %>% summarise(avg_Year_Birth = round(mean(Year_Birth)), avg_Income = round(mean(Income)), avg_Kidhome = round(mean(Kidhome)), avg_Teenhome= round(mean(Teenhome)))
# Income
p1 = data %>% ggplot(aes(x=Income)) + geom_histogram(binwidth=5000, color="#1d3557",fill="#457b9d") + geom_vline(xintercept= 52002, color="#e63946",linetype="dashed") + scale_y_continuous(limits=c(0,180)) + annotate("text",label="Average = $52,002", x= 95000,y=170) + labs(x="", y="Customer Count", title="Yearly household income")
# Year_Birth
p2 = data %>% ggplot(aes(x=Year_Birth)) + geom_histogram(binwidth=5, color="#1d3557",fill="#457b9d") + geom_vline(xintercept= 1969, color="#e63946",linetype="dashed") + scale_y_continuous(limits=c(0,380)) + annotate("text",label="Average = 1969", x= 1985,y=350) + labs(title="Birth year", y="Customer Count",x="")
# Plot
ggarrange(p1,p2,ncol=2)
# Kidhome
p3 = data %>% group_by(Kidhome) %>% tally() %>% ggplot(aes(x=factor(Kidhome),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Number of children in household") + scale_y_continuous(limits=c(0,1300))
# Teenhome
p4 = data %>% group_by(Teenhome) %>% tally() %>% ggplot(aes(x=factor(Teenhome),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Number of teens in household") + scale_y_continuous(limits=c(0,1300))
# Marital Status
p5 = data %>% group_by(Marital_Status) %>% tally() %>% ggplot(aes(x=reorder(Marital_Status,n),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Marital status") + theme(legend.position="none")
# Education
p6 = data %>% group_by(Education) %>% tally() %>% ggplot(aes(x=reorder(Education,n),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Customer's educational level") + theme(legend.position="none")
# Combined plot
ggarrange(p3,p4,p5,p6, ncol=2, nrow=2)
# Marital status and education level
data %>% group_by(Marital_Status,Education) %>% tally(sort=T) %>% mutate(Marital_Edu = paste(Marital_Status, Education, sep="-")) %>% ggplot(aes(x=reorder(Marital_Edu,n), y=n)) + geom_point(color="#f77f00") + geom_segment(aes(x=Marital_Edu, xend=Marital_Edu, y=0, yend=n),color="#335c67") + coord_flip() + theme_light() + theme(panel.border=element_blank(),axis.ticks.x=element_blank(),axis.ticks.y=element_blank(), panel.grid.major.y=element_blank(), panel.grid.minor.x=element_blank()) + labs(y="Customer count", x="", title="Customers' marital status and education level")
PrLabel = c("Fruits","Sweet","Fish","Gold","Meat","Wines")
data %>% select(MntWines, MntFruits, MntMeatProducts, MntFishProducts, MntSweetProducts, MntGoldProds) %>% gather(Product, AmtSpent, MntWines:MntGoldProds) %>% group_by(Product) %>% tally(AmtSpent) %>% ggplot(aes(x=reorder(Product,n), y=n, fill=Product)) + geom_col() + geom_text(stat="identity",aes(label=n),hjust=-0.2,size=3) + scale_y_continuous(limits=c(0,700000)) + coord_flip() + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.x=element_blank(),axis.text.y=element_text(size=10)) + labs(y="Amount spent", x= "", title = "Which products are performing best?", subtitle = "Amount spent by customers in the last two years by product type") + scale_x_discrete(label= PrLabel)
# Labels
ChLabel = c("Catalog","Web","Store")
# Plot
data %>% select(NumWebPurchases, NumCatalogPurchases, NumStorePurchases) %>% gather(Channel, Purchases, NumWebPurchases:NumStorePurchases) %>% group_by(Channel) %>% tally(Purchases) %>% ggplot(aes(x=reorder(Channel,n), y=n, fill=Channel)) + geom_col(width=0.6) + geom_text(stat="identity",aes(label=n),hjust=-0.2,size=3.5) + scale_y_continuous(limits=c(0,13000)) + coord_flip() + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.x=element_blank(), axis.text.y=element_text(size=11)) + labs(y="Number of purchases", x= "", title = "Which channels are underperforming?", subtitle = "Number of purchases made through store, web and catalog") + scale_x_discrete(labels=ChLabel)