Section 1: Data cleaning and basic exploration
# Format dates in Dt_Customer
data$Dt_Customer = mdy(data$Dt_Customer)
summary(data$Dt_Customer)
Min. 1st Qu. Median Mean 3rd Qu. Max.
"2012-07-30" "2013-01-16" "2013-07-08" "2013-07-10" "2013-12-30" "2014-06-29"
# Drop special characters in Income
data$Income = gsub('[$]([0-9]+)[,]([0-9]+)','\\1\\2',data$Income)
data$Income = as.numeric(data$Income)
- The earliest customer enrollment date in the dataset is 2012-07-30 and latest 2014-06-29
# Missing data analysis
data.frame("NA_Count" = sapply(data, function(x) { length(which(is.na(x))) })) %>%
filter(NA_Count > 0) %>%
arrange(., desc(NA_Count))
- There are 24 observations without income specified.
# Unique levels in categorical variables
summary(as.factor(data$Education))
2n Cycle Basic Graduation Master PhD
203 54 1127 370 486
summary(as.factor(data$Marital_Status))
Absurd Alone Divorced Married Single Together Widow YOLO
2 3 232 864 480 580 77 2
summary(as.factor(data$Country))
AUS CA GER IND ME SA SP US
160 268 120 148 3 337 1095 109
- Master education level are also indicated as 2n cycles in the dataset
- There are some odd levels in the Marital Status level i.e., Absurd, Alone and YOLO.
- There are only 3 observations with country listed as ME
# Recode 2n cycle to master
revalue(data$Education,c("2n Cycle" = "Master")) -> data$Education
summary(as.factor(data$Education))
Basic Graduation Master PhD
54 1127 573 486
# Recode YOLO, Alone, Absurd to Single
data$Marital_Status[data$Marital_Status == "YOLO" |
data$Marital_Status == "Absurd" |
data$Marital_Status == "Alone"] <- "Single"
summary(as.factor(data$Marital_Status))
Divorced Married Single Together Widow
232 864 487 580 77
Missing data imputation
References: Jack Daoud’s Notebook and Saeed Jafari’s Notebook.
# Summary of Income for each Education level
library(psych)
describeBy(data$Income, data$Education, mat = TRUE)
# Table of mean values
#detach(package:plyr)
average_income = data %>% group_by(Education) %>% summarise(avg_Income = mean(Income,na.rm = TRUE)) %>% as.data.frame()
head(average_income)
# Join
imput_data = left_join(data[is.na(data$Income),],
average_income,
by = c("Education"))
# Replace missing values with Avg_Income
imput_data$Income = imput_data$avg_Income
# drop avg_Income col
imput_data <- imput_data %>% select(-"avg_Income")
imput_data
# Full input data with full dataset
marketing_data <- full_join(data,
imput_data,
by = c('ID',
'Year_Birth',
'Education',
'Marital_Status',
'Kidhome',
'Teenhome',
'Dt_Customer',
'Recency',
'MntWines',
'MntFruits',
'MntMeatProducts',
'MntFishProducts',
'MntSweetProducts',
'MntGoldProds',
'NumDealsPurchases',
'NumWebPurchases',
'NumCatalogPurchases',
'NumStorePurchases',
'NumWebVisitsMonth',
'AcceptedCmp3',
'AcceptedCmp4',
'AcceptedCmp5',
'AcceptedCmp1',
'AcceptedCmp2',
'Response',
'Complain',
'Country'))
# merge Income columns into a single column
marketing_data <- marketing_data %>%
mutate(Income = coalesce(Income.x, Income.y)) %>%
select(-c("Income.x", "Income.y"))
head(marketing_data)
NA
# Check for missing values after inputation
sum(is.na(marketing_data$Income))
[1] 0
sum(is.na(marketing_data))
[1] 0
Duplicates
# Check for duplicates in ID col
length(unique(data$ID))
[1] 2240
# Drop 3 obs with country listed as ME
marketing_data = marketing_data %>% filter(Country!="ME")
dim(marketing_data)
[1] 2237 28
summary(as.factor(marketing_data$Country))
AUS CA GER IND SA SP US
160 268 120 148 337 1095 109
#examine duplicated cases using all cols except for ID
dupes = marketing_data %>% get_dupes(c(-ID))
dim(dupes)
[1] 94 29
head(dupes)
- Although no duplicates were found based on ID column, there are 94 problematic cases (47 duplicates) detected based on the rest of the columns.
# dataframe without duplicates
length(unique(marketing_data$ID))
[1] 2237
data = marketing_data %>% distinct_at(vars(-ID))
dim(data)
[1] 2190 27
Outliers and Anomalies
# Change variables types
data = data %>% mutate_at(vars(AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Response,Complain),list(factor))
# Scale numeric data
numeric_data = data %>% select(where(is.numeric))
z_data = data.frame(scale(numeric_data))
z_data$type = rownames(data)
# Plot all numeric variables
z_data %>%
pivot_longer(cols=-type) %>%
ggplot(aes(x = name, y = value)) +
geom_boxplot() +
theme_classic() +
expand_limits(x=12.6) + coord_flip()

- The boxplot of all numeric variables show that there are outliers in multiple features, in particular:
- Year_Birth variable contains 3 values before 1900
- Income has an extreme outlier i.e. $666,666
- Observations with the above-mentioned points are dropped, while the other outliers identified remains in the dataframe to avoid losing too much information.
# Drop obs with outliers in Income and Year_Birth
data = data %>% filter(!Year_Birth<1930) %>% filter(!Income==666666)
dim(data)
[1] 2186 27
Feature Construction
# Feature construction
# Numeric variable from date (Dt_Customer)
data$Dt_Customer_num = as.numeric(data$Dt_Customer)
summary(data$Dt_Customer_num)
Min. 1st Qu. Median Mean 3rd Qu. Max.
15551 15721 15894 15897 16071 16250
# TotalPurchases
data$TotalPurchases = data$NumWebPurchases + data$NumCatalogPurchases + data$NumStorePurchases
summary(data$TotalPurchases)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 6.00 12.00 12.54 18.00 32.00
# Obs without purchases
data %>% filter(TotalPurchases==0)
# TotalProducts
data$TotalProducts = data$MntWines + data$MntFruits + data$MntMeatProducts + data$MntFishProducts + data$MntSweetProducts + data$MntGoldProds
summary(data$TotalProducts)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.0 69.0 396.5 605.9 1044.8 2525.0
# Plot TotalPurchases and TotalProducts
data %>% ggplot(aes(x=TotalPurchases,y=TotalProducts)) + geom_point(alpha=0.5)

# Plot total products of TotalPurchases==0
data %>% filter(TotalPurchases==0) %>% ggplot(aes(x=TotalPurchases,y=TotalProducts)) + geom_count() + scale_y_continuous(limits=c(0.00,10.00)) + scale_size_continuous(breaks=round)

- There are 6 obs with no purchases but have amount spent on products.
# Get ratio of NumDealsPurchased to TotalPurchases
data = data %>% mutate(ratio_dt = round((NumDealsPurchases/TotalPurchases),3)) %>% mutate(ratio_dt = na_if(ratio_dt,Inf))
summary(data$ratio_dt)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0000 0.0830 0.2000 0.2481 0.3330 15.0000 6
- As there are 6 obs with no purchases, there are 6 NAs in the variable ratio_dt.
The variables Age, Age_group and Total Children are inspired by Jack Daoud’s Notebook
# Age
data$Age = 2014 - data$Year_Birth
# Age_group
data$Age_group <- ""
for (i in 1:nrow(data)) {
if (data[i, "Age"] > 54) {
data[i, "Age_group"] <- "Baby Boomer"
} else if (data[i, "Age"] > 38) {
data[i, "Age_group"] <- "Gen X"
} else if (data[i, "Age"] > 18) {
data[i, "Age_group"] <- "Gen Y"
} else {
data[i, "Age_group"] <- "Gen Z"
}
}
# TotalChildren
data$TotalChildren = data$Kidhome + data$Teenhome
# Drop anomalies>
#data = data %>% filter(TotalPurchases!=0)
#dim(data)
One hot encoding
References: Jack Daoud’s Notebook and Saeed Jafari’s Notebook.
# One hot encoding
dummy_obj = dummyVars(" ~.", data = data, fullRank = T) #full rank to avoid multi-collinearity
data_onehot = data.frame(predict(dummy_obj, newdata = data))
dim(data)
[1] 2186 34
dim(data_onehot)
[1] 2186 46
colnames(data_onehot)
[1] "Year_Birth" "EducationGraduation" "EducationMaster" "EducationPhD"
[5] "Marital_StatusMarried" "Marital_StatusSingle" "Marital_StatusTogether" "Marital_StatusWidow"
[9] "Kidhome" "Teenhome" "Dt_Customer" "Recency"
[13] "MntWines" "MntFruits" "MntMeatProducts" "MntFishProducts"
[17] "MntSweetProducts" "MntGoldProds" "NumDealsPurchases" "NumWebPurchases"
[21] "NumCatalogPurchases" "NumStorePurchases" "NumWebVisitsMonth" "AcceptedCmp3.1"
[25] "AcceptedCmp4.1" "AcceptedCmp5.1" "AcceptedCmp1.1" "AcceptedCmp2.1"
[29] "Response.1" "Complain.1" "CountryCA" "CountryGER"
[33] "CountryIND" "CountrySA" "CountrySP" "CountryUS"
[37] "Income" "Dt_Customer_num" "TotalPurchases" "TotalProducts"
[41] "ratio_dt" "Age" "Age_groupGen.X" "Age_groupGen.Y"
[45] "Age_groupGen.Z" "TotalChildren"
Section 2: Statistical analysis
2.2: Does US fare significantly better than the Rest of the World in terms of total purchases?
# Label
data22 = data
data22$country2 = ifelse(data22$Country=="US","US","World")
Hmisc::describe(data22$country2)
data22$country2
n missing distinct
2186 0 2
Value US World
Frequency 109 2077
Proportion 0.05 0.95
# Independent t test
t.test(TotalPurchases ~ country2, data= data22, var.equal=TRUE,alternative="greater")
Two Sample t-test
data: TotalPurchases by country2
t = 1.4424, df = 2184, p-value = 0.07467
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
-0.1437916 Inf
sample estimates:
mean in group US mean in group World
13.51376 12.49302
- The independent t-test results suggests that the total purchase of US is not statistically greater than the rest of the world, as the p-value is >0.05.
2.3: People who buy an above average amount on gold in the last 2 years have more in store purchase?
# Label
data23 = data
data23$gold2 = ifelse(data23$MntGoldProds> mean(data23$MntGoldProds),"above_avg","not_above_avg")
Hmisc::describe(data23$gold2)
data23$gold2
n missing distinct
2186 0 2
Value above_avg not_above_avg
Frequency 680 1506
Proportion 0.311 0.689
# Independent t test
t.test(NumStorePurchases ~ gold2, data= data23, var.equal=TRUE,alternative="greater")
Two Sample t-test
data: NumStorePurchases by gold2
t = 20.899, df = 2184, p-value < 2.2e-16
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
2.641222 Inf
sample estimates:
mean in group above_avg mean in group not_above_avg
7.764706 4.897742
- The independent t-test results show that the mean of store purchases for people who buy an above average amount of gold in the last 2 years statistically greater than people who buy a below average amount in the past 2 years, as the p-values is <0.05.
2.5: Is there a significant relationship between geographical regions and sucesss of a compaign?
- Using Response variable (if customer accepted the offer in the last campaign) as an indicator of success
# Summary of groups
data25 = data
data25$Response = as.numeric(data25$Response)
data25 %>% group_by(Country) %>% get_summary_stats(Response, type="mean_sd")
# Compare means of group using ANOVA test
res.aov = data25 %>% anova_test(Response ~ Country)
Coefficient covariances computed by hccm()
res.aov
ANOVA Table (type II tests)
Effect DFn DFd F p p<.05 ges
1 Country 6 2179 1.071 0.378 0.003
- The ANOVA test shows that the differences between the means of offer acceptance between regions are not statistically different, and we can conclude that there is no significant relationship between geographical regions and success of campaigns as the p-value is >0.05.
Section 3: Data Visualizations
3.1: Which marketing campaign is most successful?
# Prepare data
data3 = data %>% select (AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4, AcceptedCmp5)
data3 = mutate_if(data3, is.factor, ~as.numeric(as.character(.x)))
# Plot
CmpLabel = c("2nd campaign","1st campaign","3rd campaign","5th campaign","4th campaign")
data3 %>% gather(Cmp, Outcome, AcceptedCmp1:AcceptedCmp5) %>% filter(Outcome>0) %>% group_by(Cmp) %>% tally() %>% ggplot(aes(x=reorder(Cmp,n), y=n, fill=Cmp)) + geom_col(width=0.7) + geom_text(stat="identity",aes(label=n),hjust=-0.2,size=3.5) + scale_y_continuous(limits=c(0,185)) + coord_flip() + labs(y="", x="",title="Which marketing campaign is the most sucessful?", subtitle ="Number of customers who accepted the offer") + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.x=element_blank()) + scale_x_discrete(label= CmpLabel)

3.2: What does the average customer look like for this company?
# Average Year_Birth and Income
data %>% summarise(avg_Age = round(mean(Age)), avg_Income = round(mean(Income)), avg_Kidhome = round(mean(Kidhome)), avg_Teenhome= round(mean(Teenhome)))
sum(is.na(data$income))
[1] 0
# Income
p1 = data %>% ggplot(aes(x=Income)) + geom_histogram(binwidth=5000, color="#1d3557",fill="#457b9d") + geom_vline(xintercept= 52002, color="#e63946",linetype="dashed") + scale_y_continuous(limits=c(0,200)) + annotate("text",label="Average = $52,002", x= 95000,y=190) + labs(x="", y="Customer Count", title="Yearly household income")
# Age
p2 = data %>% ggplot(aes(x=Age)) + geom_histogram(binwidth=5, color="#1d3557",fill="#457b9d") + geom_vline(xintercept= 45, color="#e63946",linetype="dashed") + annotate("text",label="Average = 45", x= 56,y=360) + labs(title="Customer Age", y="Customer Count",x="")
ggarrange(p1,p2,ncol=2)

# Age group
p2_2 = data %>% group_by(Age_group) %>% tally() %>% ggplot(aes(x=Age_group,y=n, fill=Age_group)) + geom_col(width=0.5) + geom_text(stat="identity",aes(label=n),vjust=-0.3,size=3.5) + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none") + labs(x= "", y="Customer count", title="Age demographic")
# Kidhome
p3 = data %>% group_by(Kidhome) %>% tally() %>% ggplot(aes(x=factor(Kidhome),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Number of children in household") + scale_y_continuous(limits=c(0,1300))
# Teenhome
p4 = data %>% group_by(Teenhome) %>% tally() %>% ggplot(aes(x=factor(Teenhome),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Number of teens in household") + scale_y_continuous(limits=c(0,1300))
# Marital Status
p5 = data %>% group_by(Marital_Status) %>% tally() %>% ggplot(aes(x=reorder(Marital_Status,n),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Marital status") + theme(legend.position="none")
# Education
p6 = data %>% group_by(Education) %>% tally() %>% ggplot(aes(x=reorder(Education,n),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Customer's educational level") + theme(legend.position="none")
# Combined plot
ggarrange(p3,p4,p5,p6, ncol=2, nrow=2)

# Marital status and education level
data %>% group_by(Marital_Status,Education) %>% tally(sort=T) %>% mutate(Marital_Edu = paste(Marital_Status, Education, sep="-")) %>% ggplot(aes(x=reorder(Marital_Edu,n), y=n)) + geom_point(color="#f77f00") + geom_segment(aes(x=Marital_Edu, xend=Marital_Edu, y=0, yend=n),color="#335c67") + coord_flip() + theme_light() + theme(panel.border=element_blank(),axis.ticks.x=element_blank(),axis.ticks.y=element_blank(), panel.grid.major.y=element_blank(), panel.grid.minor.x=element_blank()) + labs(y="Customer count", x="", title="Customers' marital status and education level")

---
title: "Marketing Data Analysis v2"
date: "Dec 2020"
output: html_notebook
---

**Dataset**: [eCommerce Marketing](https://www.kaggle.com/jackdaoud/marketing-data)

**Task**: [Exploratory and Statistical Analysis](https://www.kaggle.com/jackdaoud/marketing-data/tasks?taskId=2986).  

**Contents**: 

* Section 1: Data cleaning and basic exploration
* Section 2: Statistical analysis
* Section 3: Data visualization

### Load libraries
```{r, message=FALSE, warning=FALSE}
library(plyr)
library(tidyverse)
library(skimr)
library(lubridate)
library(reshape)
library(Hmisc)
library(corrplot)
library(tibble)
library(viridis)
library(rstatix)
library(wesanderson)
library(ggsci)
library(caret)
library(ggpubr)
library(janitor)
```

### Import Data
```{r}
data = read.csv("marketing_data.csv",header=TRUE)
dim(data)
str(data)
```

**Dataset variables** from [Jack Daoud's Notebook](https://www.kaggle.com/jackdaoud/ecommerce-marketing-eda-statistics-analysis/comments): 

_People_

* ID: Customer's unique identifier   
* Year_Birth: Customer's birth year   
* Education: Customer's education level
* Marital_Status: Customer's marital status
* Income: Customer's yearly household income
* Kidhome: Number of children in customer's household
* Teenhome: Number of teenagers in customer's household
* Dt_Customer: Date of customer's enrollment with the company
* Recency: Number of days since customer's last purchase
* Complain: 1 if customer complained in the last 2 years, 0 otherwise
* Country: Customer's location

_Products_

* MntWines: Amount spent on wine in the last 2 years
* MntFruits: Amount spent on fruits in the last 2 years
* MntMeatProducts: Amount spent on meat in the last 2 years
* MntFishProducts: Amount spent on fish in the last 2 years
* MntSweetProducts: Amount spent on sweets in the last 2 years
* MntGoldProds: Amount spent on gold in the last 2 years

_Place_

* NumWebPurchases: Number of purchases made through the company's web site
* NumCatalogPurchases: Number of purchases made using a catalogue
* NumStorePurchases: Number of purchases made directly in stores
* NumWebVisitsMonth: Number of visits to company's web site in the last month

_Promotion_

* NumDealsPurchases: Number of purchases made with a discount
* AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise
* AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise
* AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise
* AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise
* AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise
* Response: 1 if customer accepted the offer in the last campaign, 0 otherwise


### Section 1: Data cleaning and basic exploration

```{r}
# Format dates in Dt_Customer
data$Dt_Customer = mdy(data$Dt_Customer)
summary(data$Dt_Customer)

# Drop special characters in Income 
data$Income = gsub('[$]([0-9]+)[,]([0-9]+)','\\1\\2',data$Income)
data$Income = as.numeric(data$Income)
```

* The earliest customer enrollment date in the dataset is 2012-07-30 and latest 2014-06-29


```{r}
# Missing data analysis
data.frame("NA_Count" = sapply(data, function(x) { length(which(is.na(x))) })) %>% 
  filter(NA_Count > 0) %>% 
  arrange(., desc(NA_Count))
```

* There are 24 observations without income specified.

```{r}
# Unique levels in categorical variables
summary(as.factor(data$Education))
summary(as.factor(data$Marital_Status))
summary(as.factor(data$Country))
```

* Master education level are also indicated as 2n cycles in the dataset
* There are some odd levels in the Marital Status level i.e., Absurd, Alone and YOLO. 
* There are only 3 observations with country listed as _ME_

```{r}
# Recode 2n cycle to master 
revalue(data$Education,c("2n Cycle" = "Master")) -> data$Education
summary(as.factor(data$Education))
# Recode YOLO, Alone, Absurd to Single 
data$Marital_Status[data$Marital_Status == "YOLO" |
                    data$Marital_Status == "Absurd" |
                    data$Marital_Status == "Alone"] <- "Single"
summary(as.factor(data$Marital_Status))
```

#### Missing data imputation
References: [Jack Daoud's Notebook](https://www.kaggle.com/jackdaoud/ecommerce-marketing-eda-statistics-analysis/comments) and [Saeed Jafari's Notebook](https://www.kaggle.com/saeedjafari/section-1/comments). 

```{r, warning=FALSE}
# Summary of Income for each Education level
library(psych)
describeBy(data$Income, data$Education, mat = TRUE) 
```

```{r, warning=FALSE, message=FALSE}
# Table of mean values
#detach(package:plyr)
average_income = data %>% group_by(Education) %>% summarise(avg_Income = mean(Income,na.rm = TRUE)) %>% as.data.frame()
head(average_income)
```

```{r}
# Join
imput_data = left_join(data[is.na(data$Income),], 
                          average_income, 
                          by = c("Education")) 

# Replace missing values with Avg_Income
imput_data$Income = imput_data$avg_Income 
# drop avg_Income col
imput_data <- imput_data %>% select(-"avg_Income") 

imput_data
```

```{r}
# Full input data with full dataset
marketing_data <- full_join(data,
                            imput_data, 
                            by = c('ID',
                                   'Year_Birth',
                                   'Education',
                                   'Marital_Status',
                                   'Kidhome',
                                   'Teenhome',
                                   'Dt_Customer',
                                   'Recency',
                                   'MntWines',
                                   'MntFruits',
                                   'MntMeatProducts',
                                   'MntFishProducts',
                                   'MntSweetProducts',
                                   'MntGoldProds',
                                   'NumDealsPurchases',
                                   'NumWebPurchases',
                                   'NumCatalogPurchases',
                                   'NumStorePurchases',
                                   'NumWebVisitsMonth',
                                   'AcceptedCmp3',
                                   'AcceptedCmp4',
                                   'AcceptedCmp5',
                                   'AcceptedCmp1',
                                   'AcceptedCmp2',
                                   'Response',
                                   'Complain',
                                   'Country'))

# merge Income columns into a single column
marketing_data <- marketing_data  %>%                                 
   mutate(Income = coalesce(Income.x, Income.y)) %>%
   select(-c("Income.x", "Income.y"))

head(marketing_data)

```
```{r}
# Check for missing values after inputation
sum(is.na(marketing_data$Income))
sum(is.na(marketing_data))
```

#### Duplicates 

```{r}
# Check for duplicates in ID col
length(unique(data$ID)) 
# Drop 3 obs with country listed as ME
marketing_data = marketing_data %>% filter(Country!="ME")
dim(marketing_data)
summary(as.factor(marketing_data$Country))
```


```{r}
#examine duplicated cases using all cols except for ID
dupes = marketing_data %>% get_dupes(c(-ID))
dim(dupes)
head(dupes)
```

* Although no duplicates were found based on ID column, there are 94 problematic cases (47 duplicates) detected based on the rest of the columns. 

```{r, message=FALSE, warning=FALSE}
# dataframe without duplicates
length(unique(marketing_data$ID)) 
data = marketing_data %>% distinct_at(vars(-ID))
dim(data)
```


#### Outliers and Anomalies 

```{r}
# Change variables types
data = data %>% mutate_at(vars(AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Response,Complain),list(factor))

# Scale numeric data
numeric_data = data %>% select(where(is.numeric))
z_data = data.frame(scale(numeric_data))
z_data$type = rownames(data)

# Plot all numeric variables
z_data %>% 
  pivot_longer(cols=-type) %>% 
  ggplot(aes(x = name, y = value)) +
    geom_boxplot() +
    theme_classic() +
    expand_limits(x=12.6) + coord_flip()
```
* The boxplot of all numeric variables show that there are outliers in multiple features, in particular:
  * Year_Birth variable contains 3 values before 1900 
  * Income has an extreme outlier i.e. $666,666
* Observations with the above-mentioned points are dropped, while the other outliers identified remains in the dataframe to avoid losing too much information. 

```{r}
# Drop obs with outliers in Income and Year_Birth 
data = data %>% filter(!Year_Birth<1930) %>% filter(!Income==666666)
dim(data)
```


#### Feature Construction

```{r}
# Feature construction
# Numeric variable from date (Dt_Customer)
data$Dt_Customer_num = as.numeric(data$Dt_Customer)
summary(data$Dt_Customer_num)

# TotalPurchases
data$TotalPurchases = data$NumWebPurchases + data$NumCatalogPurchases + data$NumStorePurchases
summary(data$TotalPurchases)
# Obs without purchases
data %>% filter(TotalPurchases==0)

# TotalProducts
data$TotalProducts = data$MntWines + data$MntFruits + data$MntMeatProducts + data$MntFishProducts + data$MntSweetProducts + data$MntGoldProds
summary(data$TotalProducts)

# Plot TotalPurchases and TotalProducts
data %>% ggplot(aes(x=TotalPurchases,y=TotalProducts)) + geom_point(alpha=0.5)

# Plot total products of TotalPurchases==0
data %>% filter(TotalPurchases==0) %>% ggplot(aes(x=TotalPurchases,y=TotalProducts)) + geom_count() + scale_y_continuous(limits=c(0.00,10.00)) + scale_size_continuous(breaks=round)
```

* There are 6 obs with no purchases but have amount spent on products.

```{r}
# Get ratio of NumDealsPurchased to TotalPurchases
data = data %>% mutate(ratio_dt = round((NumDealsPurchases/TotalPurchases),3)) %>% mutate(ratio_dt = na_if(ratio_dt,Inf))
summary(data$ratio_dt)
```

* As there are 6 obs with no purchases, there are 6 NAs in the variable ratio_dt. 

The variables Age, Age_group and Total Children are inspired by [Jack Daoud's Notebook](https://www.kaggle.com/jackdaoud/ecommerce-marketing-eda-statistics-analysis/comments)
```{r}
# Age
data$Age = 2014 - data$Year_Birth

# Age_group
data$Age_group <- ""
for (i in 1:nrow(data)) {
    if (data[i, "Age"] > 54) {
        data[i, "Age_group"] <- "Baby Boomer"
    } else if (data[i, "Age"] > 38) {
        data[i, "Age_group"] <- "Gen X"
    } else if (data[i, "Age"] > 18) {
        data[i, "Age_group"] <- "Gen Y"
    } else {
        data[i, "Age_group"] <- "Gen Z"
    }
}

# TotalChildren
data$TotalChildren = data$Kidhome + data$Teenhome
```


```{r}
# Drop anomalies> 
#data = data %>% filter(TotalPurchases!=0)
#dim(data)
```

#### One hot encoding
References: [Jack Daoud's Notebook](https://www.kaggle.com/jackdaoud/ecommerce-marketing-eda-statistics-analysis/comments) and [Saeed Jafari's Notebook](https://www.kaggle.com/saeedjafari/section-1/comments).

```{r}
# One hot encoding 
dummy_obj = dummyVars(" ~.", data = data, fullRank = T) #full rank to avoid multi-collinearity 
data_onehot = data.frame(predict(dummy_obj, newdata = data))
dim(data)
dim(data_onehot)
colnames(data_onehot)
```


### Section 2: Statistical analysis

#### 2.1: What factors are significantly related to the number of store purchases
```{r}
# Regression
data_21 = data[-c(1,28,29,30,31,33,34)]
linearMod1 = lm(NumStorePurchases~ ., data = data_21)
summary(linearMod1)
```

* The variables in the regression model are jointly significant as F-statistic p-value is <0.05.
* The above regression model show that factors below are significantly related to the number of store purchases
  + yearly household income
  + number of kids in  household
  + amount spent on: wine, fruits, meat products, sweet products in the last 2 years
  + number of purchases made: with a discount, through website, using a catalog
  + number of website visits in the last month 
  + acceptance of offer in: 3rd, 5th campaign 
  + date of customer's enrollment with the company

#### 2.2: Does US fare significantly better than the Rest of the World in terms of total purchases? 
```{r}
# Label
data22 = data
data22$country2 = ifelse(data22$Country=="US","US","World")
Hmisc::describe(data22$country2) 

# Independent t test
t.test(TotalPurchases ~ country2, data= data22, var.equal=TRUE,alternative="greater")
```

* The independent t-test results suggests that the total purchase of US is not statistically greater than the rest of the world, as the p-value is >0.05. 

#### 2.3: People who buy an above average amount on gold in the last 2 years have more in store purchase?
```{r}
# Label
data23 = data
data23$gold2 = ifelse(data23$MntGoldProds> mean(data23$MntGoldProds),"above_avg","not_above_avg")
Hmisc::describe(data23$gold2)

# Independent t test
t.test(NumStorePurchases ~ gold2, data= data23, var.equal=TRUE,alternative="greater")
```

* The independent t-test results show that the mean of store purchases for people who buy an above average amount of gold in the last 2 years statistically greater than people who buy a below average amount in the past 2 years, as the p-values is <0.05.  


#### 2.4: Do married PhD couples have a significant relation with amount spent on fish? What other factors are significantly related to amount spent on fish? 

```{r}
# Label: PhD couples 
data24 = data
data24$PhDcouples = ifelse(data24$Marital_Status =="Married" & data24$Education =="PhD", "1","0")
Hmisc::describe(data24$PhDcouples)
```

* There are 188 married PhD couples in the dataset. 

```{r}
# Regression
data24_model = data24[-c(1,2,3,28,29,30,31,33,34)]
linearMod2 = lm(MntFishProducts~ ., data = data24_model)
summary(linearMod2)
```

* The variables in the regression model are jointly significant as F-statistic p-value is <0.05.
* The regression model suggests the following features have a significant relation to amount spent on fish products in the past 2 years: 
  + PhD couples
  + number of teens in customers' household
  + amount spent on: fruits, meat products, sweet products, gold products in the last 2 years
  + number of purchases made: using a catalog, directly in stores
  + number of website visits in the last month
  + acceptance of offer in: 1st, 5th campaign
  + date of customer's enrollment with the company


#### 2.5: Is there a significant relationship between geographical regions and sucesss of a compaign?
* Using Response variable (if customer accepted the offer in the last campaign) as an indicator of success

```{r}
# Summary of groups
data25 = data
data25$Response = as.numeric(data25$Response)
data25 %>% group_by(Country) %>% get_summary_stats(Response, type="mean_sd")
# Compare means of group using ANOVA test
res.aov = data25 %>% anova_test(Response ~ Country)
res.aov
```
* The ANOVA test shows that the differences between the means of offer acceptance between regions are not statistically different, and we can conclude that there is no significant relationship between geographical regions and success of campaigns as the p-value is >0.05.  


### Section 3: Data Visualizations
#### 3.1: Which marketing campaign is most successful? 
```{r}
# Prepare data
data3 = data %>% select (AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4, AcceptedCmp5)
data3 = mutate_if(data3, is.factor, ~as.numeric(as.character(.x)))
# Plot
CmpLabel = c("2nd campaign","1st campaign","3rd campaign","5th campaign","4th campaign")
data3 %>% gather(Cmp, Outcome, AcceptedCmp1:AcceptedCmp5) %>% filter(Outcome>0) %>% group_by(Cmp) %>% tally() %>% ggplot(aes(x=reorder(Cmp,n), y=n, fill=Cmp)) + geom_col(width=0.7)  + geom_text(stat="identity",aes(label=n),hjust=-0.2,size=3.5) + scale_y_continuous(limits=c(0,185)) + coord_flip() + labs(y="", x="",title="Which marketing campaign is the most sucessful?", subtitle ="Number of customers who accepted the offer") + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.x=element_blank()) + scale_x_discrete(label= CmpLabel)
```


#### 3.2: What does the average customer look like for this company?
```{r}
# Average Year_Birth and Income 
data %>% summarise(avg_Age = round(mean(Age)), avg_Income = round(mean(Income)), avg_Kidhome = round(mean(Kidhome)), avg_Teenhome= round(mean(Teenhome)))
```

```{r}
sum(is.na(data$income))
```


```{r}
# Income
p1 = data %>% ggplot(aes(x=Income)) + geom_histogram(binwidth=5000, color="#1d3557",fill="#457b9d") + geom_vline(xintercept= 52002, color="#e63946",linetype="dashed") + scale_y_continuous(limits=c(0,200)) + annotate("text",label="Average = $52,002", x= 95000,y=190) + labs(x="", y="Customer Count", title="Yearly household income")


# Age
p2 = data %>% ggplot(aes(x=Age)) + geom_histogram(binwidth=5, color="#1d3557",fill="#457b9d") + geom_vline(xintercept= 45, color="#e63946",linetype="dashed") + annotate("text",label="Average = 45", x= 56,y=360) + labs(title="Customer Age", y="Customer Count",x="")

ggarrange(p1,p2,ncol=2)
```

```{r}
# Age group
p2_2 = data %>% group_by(Age_group) %>% tally() %>% ggplot(aes(x=Age_group,y=n, fill=Age_group)) + geom_col(width=0.5) + geom_text(stat="identity",aes(label=n),vjust=-0.3,size=3.5) + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none") + labs(x= "", y="Customer count", title="Age demographic")
```


```{r}
# Kidhome
p3 = data %>% group_by(Kidhome) %>% tally() %>% ggplot(aes(x=factor(Kidhome),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Number of children in household") + scale_y_continuous(limits=c(0,1300))
# Teenhome
p4 = data %>% group_by(Teenhome) %>% tally() %>% ggplot(aes(x=factor(Teenhome),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Number of teens in household") + scale_y_continuous(limits=c(0,1300))
# Marital Status 
p5 = data %>% group_by(Marital_Status) %>% tally() %>% ggplot(aes(x=reorder(Marital_Status,n),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Marital status") + theme(legend.position="none")
# Education
p6 = data %>% group_by(Education) %>% tally() %>% ggplot(aes(x=reorder(Education,n),y=n, fill=n)) + geom_col(width=0.7) + scale_fill_viridis() + theme(legend.position="none") + labs(y="Customer count", x="", title="Customer's educational level") + theme(legend.position="none")

# Combined plot
ggarrange(p3,p4,p5,p6, ncol=2, nrow=2) 
```

```{r}
# Marital status and education level
data %>% group_by(Marital_Status,Education) %>% tally(sort=T) %>% mutate(Marital_Edu = paste(Marital_Status, Education, sep="-")) %>% ggplot(aes(x=reorder(Marital_Edu,n), y=n)) + geom_point(color="#f77f00") + geom_segment(aes(x=Marital_Edu, xend=Marital_Edu, y=0, yend=n),color="#335c67") + coord_flip() + theme_light() + theme(panel.border=element_blank(),axis.ticks.x=element_blank(),axis.ticks.y=element_blank(), panel.grid.major.y=element_blank(), panel.grid.minor.x=element_blank()) + labs(y="Customer count", x="", title="Customers' marital status and education level")
```


#### 3.3: Which products are performing best?
```{r}
PrLabel = c("Fruits","Sweet","Fish","Gold","Meat","Wines")
data %>% select(MntWines, MntFruits, MntMeatProducts, MntFishProducts, MntSweetProducts, MntGoldProds) %>% gather(Product, AmtSpent, MntWines:MntGoldProds) %>% group_by(Product) %>% tally(AmtSpent) %>% ggplot(aes(x=reorder(Product,n), y=n, fill=Product)) + geom_col() + geom_text(stat="identity",aes(label=n),hjust=-0.2,size=3) + scale_y_continuous(limits=c(0,750000)) + coord_flip() + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.x=element_blank(),axis.text.y=element_text(size=10)) + labs(y="Amount spent", x= "", title = "Which products are performing best?", subtitle = "Amount spent by customers in the last two years by product type") + scale_x_discrete(label= PrLabel)
```


#### 3.3: Which products are performing best?
```{r}
# Labels
ChLabel = c("Catalog","Web","Store")
# Plot
data %>% select(NumWebPurchases, NumCatalogPurchases, NumStorePurchases) %>% gather(Channel, Purchases, NumWebPurchases:NumStorePurchases) %>% group_by(Channel) %>% tally(Purchases) %>% ggplot(aes(x=reorder(Channel,n), y=n, fill=Channel)) + geom_col(width=0.5) + geom_text(stat="identity",aes(label=n),vjust=-0.5,size=3.5) + scale_y_continuous(limits=c(0,15000)) + scale_fill_uchicago() + theme_classic() + theme(axis.ticks.x = element_blank(),axis.ticks.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(), legend.position= "none", axis.text.y=element_text(size=10),axis.text.x=element_text(size=11)) + labs(y="Number of purchases", x= "Channel", title = "Which channels are underperforming?", subtitle = "Number of purchases made through channels") + scale_x_discrete(labels=ChLabel)
```

