Dataset: eCommerce Marketing

Task: Exploratory and Statistical Analysis.

Contents:

Load libraries

library(plyr)
library(tidyverse)
library(skimr)
library(lubridate)
library(reshape)
library(Hmisc)
library(corrplot)
library(tibble)
library(viridis)
library(rstatix)
library(wesanderson)
library(ggsci)
library(caret)
library(ggpubr)
library(janitor)

Import Data

data = read.csv("marketing_data.csv",header=TRUE)
dim(data)
[1] 2240   28
str(data)
'data.frame':   2240 obs. of  28 variables:
 $ ID                 : int  1826 1 10476 1386 5371 7348 4073 1991 4047 9477 ...
 $ Year_Birth         : int  1970 1961 1958 1967 1989 1958 1954 1967 1954 1954 ...
 $ Education          : chr  "Graduation" "Graduation" "Graduation" "Graduation" ...
 $ Marital_Status     : chr  "Divorced" "Single" "Married" "Together" ...
 $ Income             : chr  "$84,835.00 " "$57,091.00 " "$67,267.00 " "$32,474.00 " ...
 $ Kidhome            : int  0 0 0 1 1 0 0 0 0 0 ...
 $ Teenhome           : int  0 0 1 1 0 0 0 1 1 1 ...
 $ Dt_Customer        : chr  "6/16/14" "6/15/14" "5/13/14" "5/11/14" ...
 $ Recency            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ MntWines           : int  189 464 134 10 6 336 769 78 384 384 ...
 $ MntFruits          : int  104 5 11 0 16 130 80 0 0 0 ...
 $ MntMeatProducts    : int  379 64 59 1 24 411 252 11 102 102 ...
 $ MntFishProducts    : int  111 7 15 0 11 240 15 0 21 21 ...
 $ MntSweetProducts   : int  189 0 2 0 0 32 34 0 32 32 ...
 $ MntGoldProds       : int  218 37 30 0 34 43 65 7 5 5 ...
 $ NumDealsPurchases  : int  1 1 1 1 2 1 1 1 3 3 ...
 $ NumWebPurchases    : int  4 7 3 1 3 4 10 2 6 6 ...
 $ NumCatalogPurchases: int  4 3 2 0 1 7 10 1 2 2 ...
 $ NumStorePurchases  : int  6 7 5 2 2 5 7 3 9 9 ...
 $ NumWebVisitsMonth  : int  1 5 2 7 7 2 6 5 4 4 ...
 $ AcceptedCmp3       : int  0 0 0 0 1 0 1 0 0 0 ...
 $ AcceptedCmp4       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ AcceptedCmp5       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ AcceptedCmp1       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ AcceptedCmp2       : int  0 1 0 0 0 0 0 0 0 0 ...
 $ Response           : int  1 1 0 0 1 1 1 0 0 0 ...
 $ Complain           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Country            : chr  "SP" "CA" "US" "AUS" ...

Dataset variables from Jack Daoud’s Notebook:

People

Products

Place

Promotion

Section 1: Data cleaning and basic exploration

# Format dates in Dt_Customer
data$Dt_Customer = mdy(data$Dt_Customer)
summary(data$Dt_Customer)
        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"2012-07-30" "2013-01-16" "2013-07-08" "2013-07-10" "2013-12-30" "2014-06-29" 
# Drop special characters in Income 
data$Income = gsub('[$]([0-9]+)[,]([0-9]+)','\\1\\2',data$Income)
data$Income = as.numeric(data$Income)
# Missing data analysis
data.frame("NA_Count" = sapply(data, function(x) { length(which(is.na(x))) })) %>% 
  filter(NA_Count > 0) %>% 
  arrange(., desc(NA_Count))
# Unique levels in categorical variables
summary(as.factor(data$Education))
  2n Cycle      Basic Graduation     Master        PhD 
       203         54       1127        370        486 
summary(as.factor(data$Marital_Status))
  Absurd    Alone Divorced  Married   Single Together    Widow     YOLO 
       2        3      232      864      480      580       77        2 
summary(as.factor(data$Country))
 AUS   CA  GER  IND   ME   SA   SP   US 
 160  268  120  148    3  337 1095  109 
# Recode 2n cycle to master 
revalue(data$Education,c("2n Cycle" = "Master")) -> data$Education
summary(as.factor(data$Education))
     Basic Graduation     Master        PhD 
        54       1127        573        486 
# Recode YOLO, Alone, Absurd to Single 
data$Marital_Status[data$Marital_Status == "YOLO" |
                    data$Marital_Status == "Absurd" |
                    data$Marital_Status == "Alone"] <- "Single"
summary(as.factor(data$Marital_Status))
Divorced  Married   Single Together    Widow 
     232      864      487      580       77 

Missing data imputation

References: Jack Daoud’s Notebook and Saeed Jafari’s Notebook.

# Summary of Income for each Education level
library(psych)
describeBy(data$Income, data$Education, mat = TRUE) 
# Table of mean values
#detach(package:plyr)
average_income = data %>% group_by(Education) %>% summarise(avg_Income = mean(Income,na.rm = TRUE)) %>% as.data.frame()
head(average_income)
# Join
imput_data = left_join(data[is.na(data$Income),], 
                          average_income, 
                          by = c("Education")) 

# Replace missing values with Avg_Income
imput_data$Income = imput_data$avg_Income 
# drop avg_Income col
imput_data <- imput_data %>% select(-"avg_Income") 

imput_data
# Full input data with full dataset
marketing_data <- full_join(data,
                            imput_data, 
                            by = c('ID',
                                   'Year_Birth',
                                   'Education',
                                   'Marital_Status',
                                   'Kidhome',
                                   'Teenhome',
                                   'Dt_Customer',
                                   'Recency',
                                   'MntWines',
                                   'MntFruits',
                                   'MntMeatProducts',
                                   'MntFishProducts',
                                   'MntSweetProducts',
                                   'MntGoldProds',
                                   'NumDealsPurchases',
                                   'NumWebPurchases',
                                   'NumCatalogPurchases',
                                   'NumStorePurchases',
                                   'NumWebVisitsMonth',
                                   'AcceptedCmp3',
                                   'AcceptedCmp4',
                                   'AcceptedCmp5',
                                   'AcceptedCmp1',
                                   'AcceptedCmp2',
                                   'Response',
                                   'Complain',
                                   'Country'))

# merge Income columns into a single column
marketing_data <- marketing_data  %>%                                 
   mutate(Income = coalesce(Income.x, Income.y)) %>%
   select(-c("Income.x", "Income.y"))

head(marketing_data)
NA
# Check for missing values after inputation
sum(is.na(marketing_data$Income))
[1] 0
sum(is.na(marketing_data))
[1] 0

Duplicates

# Check for duplicates in ID col
length(unique(data$ID)) 
[1] 2240
# Drop 3 obs with country listed as ME
marketing_data = marketing_data %>% filter(Country!="ME")
dim(marketing_data)
[1] 2237   28
summary(as.factor(marketing_data$Country))
 AUS   CA  GER  IND   SA   SP   US 
 160  268  120  148  337 1095  109 
#examine duplicated cases using all cols except for ID
dupes = marketing_data %>% get_dupes(c(-ID))
dim(dupes)
[1] 94 29
head(dupes)
  • Although no duplicates were found based on ID column, there are 94 problematic cases (47 duplicates) detected based on the rest of the columns.
# dataframe without duplicates
length(unique(marketing_data$ID)) 
[1] 2237
data = marketing_data %>% distinct_at(vars(-ID))
dim(data)
[1] 2190   27

Outliers and Anomalies

# Change variables types
data = data %>% mutate_at(vars(AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Response,Complain),list(factor))

# Scale numeric data
numeric_data = data %>% select(where(is.numeric))
z_data = data.frame(scale(numeric_data))
z_data$type = rownames(data)

# Plot all numeric variables
z_data %>% 
  pivot_longer(cols=-type) %>% 
  ggplot(aes(x = name, y = value)) +
    geom_boxplot() +
    theme_classic() +
    expand_limits(x=12.6) + coord_flip()

  • The boxplot of all numeric variables show that there are outliers in multiple features, in particular:
    • Year_Birth variable contains 3 values before 1900
    • Income has an extreme outlier i.e. $666,666
  • Observations with the above-mentioned points are dropped, while the other outliers identified remains in the dataframe to avoid losing too much information.
# Drop obs with outliers in Income and Year_Birth 
data = data %>% filter(!Year_Birth<1930) %>% filter(!Income==666666)
dim(data)
[1] 2186   27

Feature Construction

# Feature construction
# Numeric variable from date (Dt_Customer)
data$Dt_Customer_num = as.numeric(data$Dt_Customer)
summary(data$Dt_Customer_num)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  15551   15721   15894   15897   16071   16250 
# TotalPurchases
data$TotalPurchases = data$NumWebPurchases + data$NumCatalogPurchases + data$NumStorePurchases
summary(data$TotalPurchases)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    6.00   12.00   12.54   18.00   32.00 
# Obs without purchases
data %>% filter(TotalPurchases==0)

# TotalProducts
data$TotalProducts = data$MntWines + data$MntFruits + data$MntMeatProducts + data$MntFishProducts + data$MntSweetProducts + data$MntGoldProds
summary(data$TotalProducts)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    5.0    69.0   396.5   605.9  1044.8  2525.0 
# Plot TotalPurchases and TotalProducts
data %>% ggplot(aes(x=TotalPurchases,y=TotalProducts)) + geom_point(alpha=0.5)


# Plot total products of TotalPurchases==0
data %>% filter(TotalPurchases==0) %>% ggplot(aes(x=TotalPurchases,y=TotalProducts)) + geom_count() + scale_y_continuous(limits=c(0.00,10.00)) + scale_size_continuous(breaks=round)

  • There are 6 obs with no purchases but have amount spent on products.
# Get ratio of NumDealsPurchased to TotalPurchases
data = data %>% mutate(ratio_dt = round((NumDealsPurchases/TotalPurchases),3)) %>% mutate(ratio_dt = na_if(ratio_dt,Inf))
summary(data$ratio_dt)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0000  0.0830  0.2000  0.2481  0.3330 15.0000       6 
  • As there are 6 obs with no purchases, there are 6 NAs in the variable ratio_dt.

The variables Age, Age_group and Total Children are inspired by Jack Daoud’s Notebook

# Age
data$Age = 2014 - data$Year_Birth

# Age_group
data$Age_group <- ""
for (i in 1:nrow(data)) {
    if (data[i, "Age"] > 54) {
        data[i, "Age_group"] <- "Baby Boomer"
    } else if (data[i, "Age"] > 38) {
        data[i, "Age_group"] <- "Gen X"
    } else if (data[i, "Age"] > 18) {
        data[i, "Age_group"] <- "Gen Y"
    } else {
        data[i, "Age_group"] <- "Gen Z"
    }
}

# TotalChildren
data$TotalChildren = data$Kidhome + data$Teenhome
# Drop anomalies> 
#data = data %>% filter(TotalPurchases!=0)
#dim(data)

One hot encoding

References: Jack Daoud’s Notebook and Saeed Jafari’s Notebook.

# One hot encoding 
dummy_obj = dummyVars(" ~.", data = data, fullRank = T) #full rank to avoid multi-collinearity 
data_onehot = data.frame(predict(dummy_obj, newdata = data))
dim(data)
[1] 2186   34
dim(data_onehot)
[1] 2186   46
colnames(data_onehot)
 [1] "Year_Birth"             "EducationGraduation"    "EducationMaster"        "EducationPhD"          
 [5] "Marital_StatusMarried"  "Marital_StatusSingle"   "Marital_StatusTogether" "Marital_StatusWidow"   
 [9] "Kidhome"                "Teenhome"               "Dt_Customer"            "Recency"               
[13] "MntWines"               "MntFruits"              "MntMeatProducts"        "MntFishProducts"       
[17] "MntSweetProducts"       "MntGoldProds"           "NumDealsPurchases"      "NumWebPurchases"       
[21] "NumCatalogPurchases"    "NumStorePurchases"      "NumWebVisitsMonth"      "AcceptedCmp3.1"        
[25] "AcceptedCmp4.1"         "AcceptedCmp5.1"         "AcceptedCmp1.1"         "AcceptedCmp2.1"        
[29] "Response.1"             "Complain.1"             "CountryCA"              "CountryGER"            
[33] "CountryIND"             "CountrySA"              "CountrySP"              "CountryUS"             
[37] "Income"                 "Dt_Customer_num"        "TotalPurchases"         "TotalProducts"         
[41] "ratio_dt"               "Age"                    "Age_groupGen.X"         "Age_groupGen.Y"        
[45] "Age_groupGen.Z"         "TotalChildren"         

Section 2: Statistical analysis

2.2: Does US fare significantly better than the Rest of the World in terms of total purchases?

# Label
data22 = data
data22$country2 = ifelse(data22$Country=="US","US","World")
Hmisc::describe(data22$country2) 
data22$country2 
       n  missing distinct 
    2186        0        2 
                      
Value         US World
Frequency    109  2077
Proportion  0.05  0.95
# Independent t test
t.test(TotalPurchases ~ country2, data= data22, var.equal=TRUE,alternative="greater")

    Two Sample t-test

data:  TotalPurchases by country2
t = 1.4424, df = 2184, p-value = 0.07467
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 -0.1437916        Inf
sample estimates:
   mean in group US mean in group World 
           13.51376            12.49302 
  • The independent t-test results suggests that the total purchase of US is not statistically greater than the rest of the world, as the p-value is >0.05.

2.3: People who buy an above average amount on gold in the last 2 years have more in store purchase?

# Label
data23 = data
data23$gold2 = ifelse(data23$MntGoldProds> mean(data23$MntGoldProds),"above_avg","not_above_avg")
Hmisc::describe(data23$gold2)
data23$gold2 
       n  missing distinct 
    2186        0        2 
                                      
Value          above_avg not_above_avg
Frequency            680          1506
Proportion         0.311         0.689
# Independent t test
t.test(NumStorePurchases ~ gold2, data= data23, var.equal=TRUE,alternative="greater")

    Two Sample t-test

data:  NumStorePurchases by gold2
t = 20.899, df = 2184, p-value < 2.2e-16
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 2.641222      Inf
sample estimates:
    mean in group above_avg mean in group not_above_avg 
                   7.764706                    4.897742 
  • The independent t-test results show that the mean of store purchases for people who buy an above average amount of gold in the last 2 years statistically greater than people who buy a below average amount in the past 2 years, as the p-values is <0.05.

2.5: Is there a significant relationship between geographical regions and sucesss of a compaign?

  • Using Response variable (if customer accepted the offer in the last campaign) as an indicator of success
# Summary of groups
data25 = data
data25$Response = as.numeric(data25$Response)
data25 %>% group_by(Country) %>% get_summary_stats(Response, type="mean_sd")
# Compare means of group using ANOVA test
res.aov = data25 %>% anova_test(Response ~ Country)
Coefficient covariances computed by hccm()
res.aov
ANOVA Table (type II tests)

   Effect DFn  DFd     F     p p<.05   ges
1 Country   6 2179 1.071 0.378       0.003
  • The ANOVA test shows that the differences between the means of offer acceptance between regions are not statistically different, and we can conclude that there is no significant relationship between geographical regions and success of campaigns as the p-value is >0.05.

Section 3: Data Visualizations

3.1: Which marketing campaign is most successful?

# Prepare data
data3 = data %>% select (AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4, AcceptedCmp5)
data3 = mutate_if(data3, is.factor, ~as.numeric(as.character(.x)))
# Plot
CmpLabel = c("2nd campaign","1st campaign","3rd campaign","5th campaign","4th campaign")
data3 %>% gather(Cmp, Outcome, AcceptedCmp1:AcceptedCmp5) %>% filter(Outcome>0) %>% group_by(Cmp) %>% tally() %>% ggplot(aes(x=reorder(Cmp,n), y=n, fill=Cmp)) + geom_col(width=0.7)  + geom_text(stat="identity",aes(label=n),hjust=-0.2,size=3.5) + scale_y_continuous(limits=c(0,185)) + coord_flip() + labs(y="", x="",title="Which marketing campaign is the most sucessful?", subtitle ="Number of customers who accepted the offer") + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.x=element_blank()) + scale_x_discrete(label= CmpLabel)

3.2: What does the average customer look like for this company?

# Average Year_Birth and Income 
data %>% summarise(avg_Age = round(mean(Age)), avg_Income = round(mean(Income)), avg_Kidhome = round(mean(Kidhome)), avg_Teenhome= round(mean(Teenhome)))
sum(is.na(data$income))
[1] 0
# Income
p1 = data %>% ggplot(aes(x=Income)) + geom_histogram(binwidth=5000, color="#1d3557",fill="#457b9d") + geom_vline(xintercept= 52002, color="#e63946",linetype="dashed") + scale_y_continuous(limits=c(0,200)) + annotate("text",label="Average = $52,002", x= 95000,y=190) + labs(x="", y="Customer Count", title="Yearly household income")


# Age
p2 = data %>% ggplot(aes(x=Age)) + geom_histogram(binwidth=5, color="#1d3557",fill="#457b9d") + geom_vline(xintercept= 45, color="#e63946",linetype="dashed") + annotate("text",label="Average = 45", x= 56,y=360) + labs(title="Customer Age", y="Customer Count",x="")

ggarrange(p1,p2,ncol=2)

# Age group
p2_2 = data %>% group_by(Age_group) %>% tally() %>% ggplot(aes(x=Age_group,y=n, fill=Age_group)) + geom_col(width=0.5) + geom_text(stat="identity",aes(label=n),vjust=-0.3,size=3.5) + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none") + labs(x= "", y="Customer count", title="Age demographic")
# Kidhome
p3 = data %>% group_by(Kidhome) %>% tally() %>% ggplot(aes(x=factor(Kidhome),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Number of children in household") + scale_y_continuous(limits=c(0,1300))
# Teenhome
p4 = data %>% group_by(Teenhome) %>% tally() %>% ggplot(aes(x=factor(Teenhome),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Number of teens in household") + scale_y_continuous(limits=c(0,1300))
# Marital Status 
p5 = data %>% group_by(Marital_Status) %>% tally() %>% ggplot(aes(x=reorder(Marital_Status,n),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Marital status") + theme(legend.position="none")
# Education
p6 = data %>% group_by(Education) %>% tally() %>% ggplot(aes(x=reorder(Education,n),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Customer's educational level") + theme(legend.position="none")

# Combined plot
ggarrange(p3,p4,p5,p6, ncol=2, nrow=2) 

# Marital status and education level
data %>% group_by(Marital_Status,Education) %>% tally(sort=T) %>% mutate(Marital_Edu = paste(Marital_Status, Education, sep="-")) %>% ggplot(aes(x=reorder(Marital_Edu,n), y=n)) + geom_point(color="#f77f00") + geom_segment(aes(x=Marital_Edu, xend=Marital_Edu, y=0, yend=n),color="#335c67") + coord_flip() + theme_light() + theme(panel.border=element_blank(),axis.ticks.x=element_blank(),axis.ticks.y=element_blank(), panel.grid.major.y=element_blank(), panel.grid.minor.x=element_blank()) + labs(y="Customer count", x="", title="Customers' marital status and education level")

3.3: Which products are performing best?

PrLabel = c("Fruits","Sweet","Fish","Gold","Meat","Wines")
data %>% select(MntWines, MntFruits, MntMeatProducts, MntFishProducts, MntSweetProducts, MntGoldProds) %>% gather(Product, AmtSpent, MntWines:MntGoldProds) %>% group_by(Product) %>% tally(AmtSpent) %>% ggplot(aes(x=reorder(Product,n), y=n, fill=Product)) + geom_col() + geom_text(stat="identity",aes(label=n),hjust=-0.2,size=3) + scale_y_continuous(limits=c(0,750000)) + coord_flip() + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.x=element_blank(),axis.text.y=element_text(size=10)) + labs(y="Amount spent", x= "", title = "Which products are performing best?", subtitle = "Amount spent by customers in the last two years by product type") + scale_x_discrete(label= PrLabel)

3.3: Which products are performing best?

# Labels
ChLabel = c("Catalog","Web","Store")
# Plot
data %>% select(NumWebPurchases, NumCatalogPurchases, NumStorePurchases) %>% gather(Channel, Purchases, NumWebPurchases:NumStorePurchases) %>% group_by(Channel) %>% tally(Purchases) %>% ggplot(aes(x=reorder(Channel,n), y=n, fill=Channel)) + geom_col(width=0.5) + geom_text(stat="identity",aes(label=n),vjust=-0.5,size=3.5) + scale_y_continuous(limits=c(0,15000)) + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.y=element_text(size=10),axis.text.x=element_text(size=11)) + labs(y="Number of purchases", x= "Channel", title = "Which channels are underperforming?", subtitle = "Number of purchases made through channels") + scale_x_discrete(labels=ChLabel)

---
title: "Marketing Data Analysis v2"
date: "Dec 2020"
output: html_notebook
---

**Dataset**: [eCommerce Marketing](https://www.kaggle.com/jackdaoud/marketing-data)

**Task**: [Exploratory and Statistical Analysis](https://www.kaggle.com/jackdaoud/marketing-data/tasks?taskId=2986).  

**Contents**: 

* Section 1: Data cleaning and basic exploration
* Section 2: Statistical analysis
* Section 3: Data visualization

### Load libraries
```{r, message=FALSE, warning=FALSE}
library(plyr)
library(tidyverse)
library(skimr)
library(lubridate)
library(reshape)
library(Hmisc)
library(corrplot)
library(tibble)
library(viridis)
library(rstatix)
library(wesanderson)
library(ggsci)
library(caret)
library(ggpubr)
library(janitor)
```

### Import Data
```{r}
data = read.csv("marketing_data.csv",header=TRUE)
dim(data)
str(data)
```

**Dataset variables** from [Jack Daoud's Notebook](https://www.kaggle.com/jackdaoud/ecommerce-marketing-eda-statistics-analysis/comments): 

_People_

* ID: Customer's unique identifier   
* Year_Birth: Customer's birth year   
* Education: Customer's education level
* Marital_Status: Customer's marital status
* Income: Customer's yearly household income
* Kidhome: Number of children in customer's household
* Teenhome: Number of teenagers in customer's household
* Dt_Customer: Date of customer's enrollment with the company
* Recency: Number of days since customer's last purchase
* Complain: 1 if customer complained in the last 2 years, 0 otherwise
* Country: Customer's location

_Products_

* MntWines: Amount spent on wine in the last 2 years
* MntFruits: Amount spent on fruits in the last 2 years
* MntMeatProducts: Amount spent on meat in the last 2 years
* MntFishProducts: Amount spent on fish in the last 2 years
* MntSweetProducts: Amount spent on sweets in the last 2 years
* MntGoldProds: Amount spent on gold in the last 2 years

_Place_

* NumWebPurchases: Number of purchases made through the company's web site
* NumCatalogPurchases: Number of purchases made using a catalogue
* NumStorePurchases: Number of purchases made directly in stores
* NumWebVisitsMonth: Number of visits to company's web site in the last month

_Promotion_

* NumDealsPurchases: Number of purchases made with a discount
* AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise
* AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise
* AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise
* AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise
* AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise
* Response: 1 if customer accepted the offer in the last campaign, 0 otherwise


### Section 1: Data cleaning and basic exploration

```{r}
# Format dates in Dt_Customer
data$Dt_Customer = mdy(data$Dt_Customer)
summary(data$Dt_Customer)

# Drop special characters in Income 
data$Income = gsub('[$]([0-9]+)[,]([0-9]+)','\\1\\2',data$Income)
data$Income = as.numeric(data$Income)
```

* The earliest customer enrollment date in the dataset is 2012-07-30 and latest 2014-06-29


```{r}
# Missing data analysis
data.frame("NA_Count" = sapply(data, function(x) { length(which(is.na(x))) })) %>% 
  filter(NA_Count > 0) %>% 
  arrange(., desc(NA_Count))
```

* There are 24 observations without income specified.

```{r}
# Unique levels in categorical variables
summary(as.factor(data$Education))
summary(as.factor(data$Marital_Status))
summary(as.factor(data$Country))
```

* Master education level are also indicated as 2n cycles in the dataset
* There are some odd levels in the Marital Status level i.e., Absurd, Alone and YOLO. 
* There are only 3 observations with country listed as _ME_

```{r}
# Recode 2n cycle to master 
revalue(data$Education,c("2n Cycle" = "Master")) -> data$Education
summary(as.factor(data$Education))
# Recode YOLO, Alone, Absurd to Single 
data$Marital_Status[data$Marital_Status == "YOLO" |
                    data$Marital_Status == "Absurd" |
                    data$Marital_Status == "Alone"] <- "Single"
summary(as.factor(data$Marital_Status))
```

#### Missing data imputation
References: [Jack Daoud's Notebook](https://www.kaggle.com/jackdaoud/ecommerce-marketing-eda-statistics-analysis/comments) and [Saeed Jafari's Notebook](https://www.kaggle.com/saeedjafari/section-1/comments). 

```{r, warning=FALSE}
# Summary of Income for each Education level
library(psych)
describeBy(data$Income, data$Education, mat = TRUE) 
```

```{r, warning=FALSE, message=FALSE}
# Table of mean values
#detach(package:plyr)
average_income = data %>% group_by(Education) %>% summarise(avg_Income = mean(Income,na.rm = TRUE)) %>% as.data.frame()
head(average_income)
```

```{r}
# Join
imput_data = left_join(data[is.na(data$Income),], 
                          average_income, 
                          by = c("Education")) 

# Replace missing values with Avg_Income
imput_data$Income = imput_data$avg_Income 
# drop avg_Income col
imput_data <- imput_data %>% select(-"avg_Income") 

imput_data
```

```{r}
# Full input data with full dataset
marketing_data <- full_join(data,
                            imput_data, 
                            by = c('ID',
                                   'Year_Birth',
                                   'Education',
                                   'Marital_Status',
                                   'Kidhome',
                                   'Teenhome',
                                   'Dt_Customer',
                                   'Recency',
                                   'MntWines',
                                   'MntFruits',
                                   'MntMeatProducts',
                                   'MntFishProducts',
                                   'MntSweetProducts',
                                   'MntGoldProds',
                                   'NumDealsPurchases',
                                   'NumWebPurchases',
                                   'NumCatalogPurchases',
                                   'NumStorePurchases',
                                   'NumWebVisitsMonth',
                                   'AcceptedCmp3',
                                   'AcceptedCmp4',
                                   'AcceptedCmp5',
                                   'AcceptedCmp1',
                                   'AcceptedCmp2',
                                   'Response',
                                   'Complain',
                                   'Country'))

# merge Income columns into a single column
marketing_data <- marketing_data  %>%                                 
   mutate(Income = coalesce(Income.x, Income.y)) %>%
   select(-c("Income.x", "Income.y"))

head(marketing_data)

```
```{r}
# Check for missing values after inputation
sum(is.na(marketing_data$Income))
sum(is.na(marketing_data))
```

#### Duplicates 

```{r}
# Check for duplicates in ID col
length(unique(data$ID)) 
# Drop 3 obs with country listed as ME
marketing_data = marketing_data %>% filter(Country!="ME")
dim(marketing_data)
summary(as.factor(marketing_data$Country))
```


```{r}
#examine duplicated cases using all cols except for ID
dupes = marketing_data %>% get_dupes(c(-ID))
dim(dupes)
head(dupes)
```

* Although no duplicates were found based on ID column, there are 94 problematic cases (47 duplicates) detected based on the rest of the columns. 

```{r, message=FALSE, warning=FALSE}
# dataframe without duplicates
length(unique(marketing_data$ID)) 
data = marketing_data %>% distinct_at(vars(-ID))
dim(data)
```


#### Outliers and Anomalies 

```{r}
# Change variables types
data = data %>% mutate_at(vars(AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Response,Complain),list(factor))

# Scale numeric data
numeric_data = data %>% select(where(is.numeric))
z_data = data.frame(scale(numeric_data))
z_data$type = rownames(data)

# Plot all numeric variables
z_data %>% 
  pivot_longer(cols=-type) %>% 
  ggplot(aes(x = name, y = value)) +
    geom_boxplot() +
    theme_classic() +
    expand_limits(x=12.6) + coord_flip()
```
* The boxplot of all numeric variables show that there are outliers in multiple features, in particular:
  * Year_Birth variable contains 3 values before 1900 
  * Income has an extreme outlier i.e. $666,666
* Observations with the above-mentioned points are dropped, while the other outliers identified remains in the dataframe to avoid losing too much information. 

```{r}
# Drop obs with outliers in Income and Year_Birth 
data = data %>% filter(!Year_Birth<1930) %>% filter(!Income==666666)
dim(data)
```


#### Feature Construction

```{r}
# Feature construction
# Numeric variable from date (Dt_Customer)
data$Dt_Customer_num = as.numeric(data$Dt_Customer)
summary(data$Dt_Customer_num)

# TotalPurchases
data$TotalPurchases = data$NumWebPurchases + data$NumCatalogPurchases + data$NumStorePurchases
summary(data$TotalPurchases)
# Obs without purchases
data %>% filter(TotalPurchases==0)

# TotalProducts
data$TotalProducts = data$MntWines + data$MntFruits + data$MntMeatProducts + data$MntFishProducts + data$MntSweetProducts + data$MntGoldProds
summary(data$TotalProducts)

# Plot TotalPurchases and TotalProducts
data %>% ggplot(aes(x=TotalPurchases,y=TotalProducts)) + geom_point(alpha=0.5)

# Plot total products of TotalPurchases==0
data %>% filter(TotalPurchases==0) %>% ggplot(aes(x=TotalPurchases,y=TotalProducts)) + geom_count() + scale_y_continuous(limits=c(0.00,10.00)) + scale_size_continuous(breaks=round)
```

* There are 6 obs with no purchases but have amount spent on products.

```{r}
# Get ratio of NumDealsPurchased to TotalPurchases
data = data %>% mutate(ratio_dt = round((NumDealsPurchases/TotalPurchases),3)) %>% mutate(ratio_dt = na_if(ratio_dt,Inf))
summary(data$ratio_dt)
```

* As there are 6 obs with no purchases, there are 6 NAs in the variable ratio_dt. 

The variables Age, Age_group and Total Children are inspired by [Jack Daoud's Notebook](https://www.kaggle.com/jackdaoud/ecommerce-marketing-eda-statistics-analysis/comments)
```{r}
# Age
data$Age = 2014 - data$Year_Birth

# Age_group
data$Age_group <- ""
for (i in 1:nrow(data)) {
    if (data[i, "Age"] > 54) {
        data[i, "Age_group"] <- "Baby Boomer"
    } else if (data[i, "Age"] > 38) {
        data[i, "Age_group"] <- "Gen X"
    } else if (data[i, "Age"] > 18) {
        data[i, "Age_group"] <- "Gen Y"
    } else {
        data[i, "Age_group"] <- "Gen Z"
    }
}

# TotalChildren
data$TotalChildren = data$Kidhome + data$Teenhome
```


```{r}
# Drop anomalies> 
#data = data %>% filter(TotalPurchases!=0)
#dim(data)
```

#### One hot encoding
References: [Jack Daoud's Notebook](https://www.kaggle.com/jackdaoud/ecommerce-marketing-eda-statistics-analysis/comments) and [Saeed Jafari's Notebook](https://www.kaggle.com/saeedjafari/section-1/comments).

```{r}
# One hot encoding 
dummy_obj = dummyVars(" ~.", data = data, fullRank = T) #full rank to avoid multi-collinearity 
data_onehot = data.frame(predict(dummy_obj, newdata = data))
dim(data)
dim(data_onehot)
colnames(data_onehot)
```


### Section 2: Statistical analysis

#### 2.1: What factors are significantly related to the number of store purchases
```{r}
# Regression
data_21 = data[-c(1,28,29,30,31,33,34)]
linearMod1 = lm(NumStorePurchases~ ., data = data_21)
summary(linearMod1)
```

* The variables in the regression model are jointly significant as F-statistic p-value is <0.05.
* The above regression model show that factors below are significantly related to the number of store purchases
  + yearly household income
  + number of kids in  household
  + amount spent on: wine, fruits, meat products, sweet products in the last 2 years
  + number of purchases made: with a discount, through website, using a catalog
  + number of website visits in the last month 
  + acceptance of offer in: 3rd, 5th campaign 
  + date of customer's enrollment with the company

#### 2.2: Does US fare significantly better than the Rest of the World in terms of total purchases? 
```{r}
# Label
data22 = data
data22$country2 = ifelse(data22$Country=="US","US","World")
Hmisc::describe(data22$country2) 

# Independent t test
t.test(TotalPurchases ~ country2, data= data22, var.equal=TRUE,alternative="greater")
```

* The independent t-test results suggests that the total purchase of US is not statistically greater than the rest of the world, as the p-value is >0.05. 

#### 2.3: People who buy an above average amount on gold in the last 2 years have more in store purchase?
```{r}
# Label
data23 = data
data23$gold2 = ifelse(data23$MntGoldProds> mean(data23$MntGoldProds),"above_avg","not_above_avg")
Hmisc::describe(data23$gold2)

# Independent t test
t.test(NumStorePurchases ~ gold2, data= data23, var.equal=TRUE,alternative="greater")
```

* The independent t-test results show that the mean of store purchases for people who buy an above average amount of gold in the last 2 years statistically greater than people who buy a below average amount in the past 2 years, as the p-values is <0.05.  


#### 2.4: Do married PhD couples have a significant relation with amount spent on fish? What other factors are significantly related to amount spent on fish? 

```{r}
# Label: PhD couples 
data24 = data
data24$PhDcouples = ifelse(data24$Marital_Status =="Married" & data24$Education =="PhD", "1","0")
Hmisc::describe(data24$PhDcouples)
```

* There are 188 married PhD couples in the dataset. 

```{r}
# Regression
data24_model = data24[-c(1,2,3,28,29,30,31,33,34)]
linearMod2 = lm(MntFishProducts~ ., data = data24_model)
summary(linearMod2)
```

* The variables in the regression model are jointly significant as F-statistic p-value is <0.05.
* The regression model suggests the following features have a significant relation to amount spent on fish products in the past 2 years: 
  + PhD couples
  + number of teens in customers' household
  + amount spent on: fruits, meat products, sweet products, gold products in the last 2 years
  + number of purchases made: using a catalog, directly in stores
  + number of website visits in the last month
  + acceptance of offer in: 1st, 5th campaign
  + date of customer's enrollment with the company


#### 2.5: Is there a significant relationship between geographical regions and sucesss of a compaign?
* Using Response variable (if customer accepted the offer in the last campaign) as an indicator of success

```{r}
# Summary of groups
data25 = data
data25$Response = as.numeric(data25$Response)
data25 %>% group_by(Country) %>% get_summary_stats(Response, type="mean_sd")
# Compare means of group using ANOVA test
res.aov = data25 %>% anova_test(Response ~ Country)
res.aov
```
* The ANOVA test shows that the differences between the means of offer acceptance between regions are not statistically different, and we can conclude that there is no significant relationship between geographical regions and success of campaigns as the p-value is >0.05.  


### Section 3: Data Visualizations
#### 3.1: Which marketing campaign is most successful? 
```{r}
# Prepare data
data3 = data %>% select (AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4, AcceptedCmp5)
data3 = mutate_if(data3, is.factor, ~as.numeric(as.character(.x)))
# Plot
CmpLabel = c("2nd campaign","1st campaign","3rd campaign","5th campaign","4th campaign")
data3 %>% gather(Cmp, Outcome, AcceptedCmp1:AcceptedCmp5) %>% filter(Outcome>0) %>% group_by(Cmp) %>% tally() %>% ggplot(aes(x=reorder(Cmp,n), y=n, fill=Cmp)) + geom_col(width=0.7)  + geom_text(stat="identity",aes(label=n),hjust=-0.2,size=3.5) + scale_y_continuous(limits=c(0,185)) + coord_flip() + labs(y="", x="",title="Which marketing campaign is the most sucessful?", subtitle ="Number of customers who accepted the offer") + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.x=element_blank()) + scale_x_discrete(label= CmpLabel)
```


#### 3.2: What does the average customer look like for this company?
```{r}
# Average Year_Birth and Income 
data %>% summarise(avg_Age = round(mean(Age)), avg_Income = round(mean(Income)), avg_Kidhome = round(mean(Kidhome)), avg_Teenhome= round(mean(Teenhome)))
```

```{r}
sum(is.na(data$income))
```


```{r}
# Income
p1 = data %>% ggplot(aes(x=Income)) + geom_histogram(binwidth=5000, color="#1d3557",fill="#457b9d") + geom_vline(xintercept= 52002, color="#e63946",linetype="dashed") + scale_y_continuous(limits=c(0,200)) + annotate("text",label="Average = $52,002", x= 95000,y=190) + labs(x="", y="Customer Count", title="Yearly household income")


# Age
p2 = data %>% ggplot(aes(x=Age)) + geom_histogram(binwidth=5, color="#1d3557",fill="#457b9d") + geom_vline(xintercept= 45, color="#e63946",linetype="dashed") + annotate("text",label="Average = 45", x= 56,y=360) + labs(title="Customer Age", y="Customer Count",x="")

ggarrange(p1,p2,ncol=2)
```

```{r}
# Age group
p2_2 = data %>% group_by(Age_group) %>% tally() %>% ggplot(aes(x=Age_group,y=n, fill=Age_group)) + geom_col(width=0.5) + geom_text(stat="identity",aes(label=n),vjust=-0.3,size=3.5) + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none") + labs(x= "", y="Customer count", title="Age demographic")
```


```{r}
# Kidhome
p3 = data %>% group_by(Kidhome) %>% tally() %>% ggplot(aes(x=factor(Kidhome),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Number of children in household") + scale_y_continuous(limits=c(0,1300))
# Teenhome
p4 = data %>% group_by(Teenhome) %>% tally() %>% ggplot(aes(x=factor(Teenhome),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Number of teens in household") + scale_y_continuous(limits=c(0,1300))
# Marital Status 
p5 = data %>% group_by(Marital_Status) %>% tally() %>% ggplot(aes(x=reorder(Marital_Status,n),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Marital status") + theme(legend.position="none")
# Education
p6 = data %>% group_by(Education) %>% tally() %>% ggplot(aes(x=reorder(Education,n),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Customer's educational level") + theme(legend.position="none")

# Combined plot
ggarrange(p3,p4,p5,p6, ncol=2, nrow=2) 
```

```{r}
# Marital status and education level
data %>% group_by(Marital_Status,Education) %>% tally(sort=T) %>% mutate(Marital_Edu = paste(Marital_Status, Education, sep="-")) %>% ggplot(aes(x=reorder(Marital_Edu,n), y=n)) + geom_point(color="#f77f00") + geom_segment(aes(x=Marital_Edu, xend=Marital_Edu, y=0, yend=n),color="#335c67") + coord_flip() + theme_light() + theme(panel.border=element_blank(),axis.ticks.x=element_blank(),axis.ticks.y=element_blank(), panel.grid.major.y=element_blank(), panel.grid.minor.x=element_blank()) + labs(y="Customer count", x="", title="Customers' marital status and education level")
```


#### 3.3: Which products are performing best?
```{r}
PrLabel = c("Fruits","Sweet","Fish","Gold","Meat","Wines")
data %>% select(MntWines, MntFruits, MntMeatProducts, MntFishProducts, MntSweetProducts, MntGoldProds) %>% gather(Product, AmtSpent, MntWines:MntGoldProds) %>% group_by(Product) %>% tally(AmtSpent) %>% ggplot(aes(x=reorder(Product,n), y=n, fill=Product)) + geom_col() + geom_text(stat="identity",aes(label=n),hjust=-0.2,size=3) + scale_y_continuous(limits=c(0,750000)) + coord_flip() + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.x=element_blank(),axis.text.y=element_text(size=10)) + labs(y="Amount spent", x= "", title = "Which products are performing best?", subtitle = "Amount spent by customers in the last two years by product type") + scale_x_discrete(label= PrLabel)
```


#### 3.3: Which products are performing best?
```{r}
# Labels
ChLabel = c("Catalog","Web","Store")
# Plot
data %>% select(NumWebPurchases, NumCatalogPurchases, NumStorePurchases) %>% gather(Channel, Purchases, NumWebPurchases:NumStorePurchases) %>% group_by(Channel) %>% tally(Purchases) %>% ggplot(aes(x=reorder(Channel,n), y=n, fill=Channel)) + geom_col(width=0.5) + geom_text(stat="identity",aes(label=n),vjust=-0.5,size=3.5) + scale_y_continuous(limits=c(0,15000)) + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.y=element_text(size=10),axis.text.x=element_text(size=11)) + labs(y="Number of purchases", x= "Channel", title = "Which channels are underperforming?", subtitle = "Number of purchases made through channels") + scale_x_discrete(labels=ChLabel)
```

