Getting the data
a. Download data from kaggle
The data used in this project was downloaded from Kaggle: the “Customer Personality Analysis” dataset.
b. Data Cleaning
I noticed that some observations contain mistakes, so I will exclude them from the sample. I will also exclude the richest 1%: assuming that the income distribution is approximately lognormal, the richest customers are outliers whose behavior tends not to be representative, so they are left out of this analysis.
# Income cutoff: the 99th percentile of Income in the raw data (NAs ignored).
richest <- quantile(db$Income, probs = 0.99, na.rm = TRUE)

# Cleaned sample: drop the excluded education and marital-status categories
# and keep only rows whose income is at or below the cutoff.
db1 <- db %>%
  filter(
    Education != "2n Cycle" & Education != "Basic",
    Marital_Status != "Alone" & Marital_Status != "Together" &
      Marital_Status != "Absurd" & Marital_Status != "YOLO",
    Income <= richest
  )
c. Getting to know the data
Let’s get some summary statistics in order to get to know the data and in order to spot some mistakes.
# One-row summary of the cleaned sample: income, birth year and the number of
# kids / teens at home. NAs are ignored in every statistic.
# age_old / age_young hold birth years (earliest and latest), not ages.
table1 <- db1 %>%
  summarise(
    income_mean = mean(Income, na.rm = TRUE),
    income_max  = max(Income, na.rm = TRUE),
    income_min  = min(Income, na.rm = TRUE),
    age_old     = min(Year_Birth, na.rm = TRUE),
    age_young   = max(Year_Birth, na.rm = TRUE),
    kid_mean    = mean(Kidhome, na.rm = TRUE),
    teen_mean   = mean(Teenhome, na.rm = TRUE),
    kid_max     = max(Kidhome, na.rm = TRUE),
    teen_max    = max(Teenhome, na.rm = TRUE),
    kid_min     = min(Kidhome, na.rm = TRUE),
    teen_min    = min(Teenhome, na.rm = TRUE)
  )

# Render the summary as a styled table.
table1 %>%
  kable() %>%
  kable_styling()
|
income_mean
|
income_max
|
income_min
|
age_old
|
age_young
|
kid_mean
|
teen_mean
|
kid_max
|
teen_max
|
kid_min
|
teen_min
|
|
52442.8
|
93790
|
1730
|
1940
|
1995
|
0.4356298
|
0.5191371
|
2
|
2
|
0
|
0
|
# Cross-tabulation of education level (rows) by marital status (columns)
# in the cleaned sample, rendered as a styled table.
table(db1$Education, db1$Marital_Status) %>%
  kable() %>%
  kable_styling()
|
|
Divorced
|
Married
|
Single
|
Widow
|
|
Graduation
|
117
|
428
|
242
|
35
|
|
Master
|
37
|
138
|
73
|
11
|
|
PhD
|
52
|
186
|
94
|
24
|
d. Exploration of correlations - response to campaigns
In order to see who responds to each campaign, let’s run some regressions. The objective is to see the level of significance of the demographic variables. If we had more information about each specific campaign, we could form expectations about the results; but since we have no information about the campaigns, how they work, or who their target is, we run general regressions.
# Outcomes to model: acceptance of each of the five campaigns, plus Complain.
Dependent <- select(db1, AcceptedCmp1, AcceptedCmp2, AcceptedCmp3, AcceptedCmp4, AcceptedCmp5, Complain)

# Fit one linear model per outcome on the same demographic regressors.
# The formula is built from the outcome's column name and spliced into the
# lm() call, so every fitted model records its real dependent variable.
# Writing `lm(x ~ ...)` inside lapply() instead labels every model's
# dependent variable as the placeholder `x` in the regression table.
models1 <- lapply(setNames(nm = names(Dependent)), function(outcome) {
  model_formula <- reformulate(
    c("Education", "Marital_Status", "log(Income)", "Kidhome", "Teenhome"),
    response = outcome
  )
  eval(bquote(lm(.(model_formula), data = db1)))
})

# HTML regression table, one column per outcome.
stargazer(models1, title="Models: Response to Campaigns", align=TRUE, type='html')
Models: Response to Campaigns
|
|
|
|
Dependent variable:
|
|
|
|
|
|
x
|
|
|
(1)
|
(2)
|
(3)
|
(4)
|
(5)
|
(6)
|
|
|
|
EducationMaster
|
-0.004
|
-0.006
|
-0.010
|
0.016
|
0.013
|
-0.005
|
|
|
(0.017)
|
(0.007)
|
(0.019)
|
(0.019)
|
(0.017)
|
(0.007)
|
|
|
|
|
|
|
|
|
|
EducationPhD
|
-0.002
|
0.006
|
0.00003
|
0.016
|
0.018
|
-0.009
|
|
|
(0.015)
|
(0.006)
|
(0.017)
|
(0.017)
|
(0.015)
|
(0.006)
|
|
|
|
|
|
|
|
|
|
Marital_StatusMarried
|
0.017
|
-0.0002
|
-0.029
|
0.014
|
0.028
|
0.004
|
|
|
(0.018)
|
(0.008)
|
(0.021)
|
(0.021)
|
(0.019)
|
(0.007)
|
|
|
|
|
|
|
|
|
|
Marital_StatusSingle
|
0.002
|
0.003
|
-0.020
|
0.002
|
0.002
|
0.008
|
|
|
(0.020)
|
(0.009)
|
(0.023)
|
(0.022)
|
(0.020)
|
(0.008)
|
|
|
|
|
|
|
|
|
|
Marital_StatusWidow
|
-0.010
|
0.001
|
-0.036
|
0.055
|
0.034
|
-0.002
|
|
|
(0.032)
|
(0.014)
|
(0.037)
|
(0.036)
|
(0.033)
|
(0.013)
|
|
|
|
|
|
|
|
|
|
log(Income)
|
0.122***
|
0.011*
|
-0.004
|
0.080***
|
0.156***
|
0.001
|
|
|
(0.014)
|
(0.006)
|
(0.017)
|
(0.016)
|
(0.015)
|
(0.006)
|
|
|
|
|
|
|
|
|
|
Kidhome
|
-0.036***
|
-0.011**
|
0.011
|
-0.042***
|
-0.026**
|
0.013**
|
|
|
(0.013)
|
(0.006)
|
(0.015)
|
(0.014)
|
(0.013)
|
(0.005)
|
|
|
|
|
|
|
|
|
|
Teenhome
|
-0.082***
|
-0.003
|
-0.031**
|
0.011
|
-0.115***
|
0.004
|
|
|
(0.011)
|
(0.005)
|
(0.013)
|
(0.013)
|
(0.012)
|
(0.005)
|
|
|
|
|
|
|
|
|
|
Constant
|
-1.196***
|
-0.098
|
0.151
|
-0.795***
|
-1.568***
|
-0.016
|
|
|
(0.158)
|
(0.069)
|
(0.182)
|
(0.178)
|
(0.161)
|
(0.064)
|
|
|
|
|
|
|
|
|
|
|
|
Observations
|
1,437
|
1,437
|
1,437
|
1,437
|
1,437
|
1,437
|
|
R2
|
0.105
|
0.011
|
0.007
|
0.047
|
0.153
|
0.008
|
|
Adjusted R2
|
0.100
|
0.005
|
0.001
|
0.041
|
0.148
|
0.003
|
|
Residual Std. Error (df = 1428)
|
0.232
|
0.101
|
0.267
|
0.260
|
0.236
|
0.095
|
|
F Statistic (df = 8; 1428)
|
20.848***
|
1.931*
|
1.182
|
8.718***
|
32.272***
|
1.507
|
|
|
|
Note:
|
*p<0.1; **p<0.05; ***p<0.01
|
- People with higher income responded positively to campaigns 1, 4 and 5.
- People with kids at home had a negative response to campaigns 1, 2, 4 and 5.
- Families with teens at home responded negatively to campaigns 1, 3 and 5.
- The education level and marital status were not significant in any category for any campaign.
e. Exploration of correlations - product preferences
# Amount spent per product category; each column is one regression outcome.
# TODO(review): confirm this product list is complete -- the original note
# here said the product variables still needed to be incorporated.
Products <- select(db1, MntWines, MntFruits, MntMeatProducts, MntSweetProducts, MntGoldProds)

# Fit one linear model per product on the same demographic regressors.
# The formula is built from the product's column name and spliced into the
# lm() call, so every fitted model records its real dependent variable.
# Writing `lm(x ~ ...)` inside lapply() instead labels every model's
# dependent variable as the placeholder `x` in the regression table.
models2 <- lapply(setNames(nm = names(Products)), function(outcome) {
  model_formula <- reformulate(
    c("Education", "Marital_Status", "log(Income)", "Kidhome", "Teenhome"),
    response = outcome
  )
  eval(bquote(lm(.(model_formula), data = db1)))
})

# HTML regression table, one column per product.
stargazer(models2, title="Models: Product Preferences", align=TRUE, type='html')
Models: Product Preferences
|
|
|
|
Dependent variable:
|
|
|
|
|
|
x
|
|
|
(1)
|
(2)
|
(3)
|
(4)
|
(5)
|
|
|
|
EducationMaster
|
54.368***
|
-11.125***
|
-14.077
|
-11.227***
|
-9.448***
|
|
|
(17.710)
|
(2.400)
|
(11.593)
|
(2.387)
|
(3.336)
|
|
|
|
|
|
|
|
|
EducationPhD
|
99.945***
|
-11.826***
|
-27.860***
|
-13.162***
|
-21.166***
|
|
|
(15.865)
|
(2.150)
|
(10.385)
|
(2.138)
|
(2.989)
|
|
|
|
|
|
|
|
|
Marital_StatusMarried
|
-1.496
|
-2.643
|
5.796
|
0.922
|
-2.797
|
|
|
(19.518)
|
(2.645)
|
(12.777)
|
(2.631)
|
(3.677)
|
|
|
|
|
|
|
|
|
Marital_StatusSingle
|
-14.703
|
-1.671
|
20.052
|
-0.456
|
-1.447
|
|
|
(21.305)
|
(2.888)
|
(13.947)
|
(2.872)
|
(4.014)
|
|
|
|
|
|
|
|
|
Marital_StatusWidow
|
-26.792
|
-2.259
|
-0.343
|
5.820
|
5.471
|
|
|
(34.391)
|
(4.661)
|
(22.513)
|
(4.636)
|
(6.479)
|
|
|
|
|
|
|
|
|
log(Income)
|
359.950***
|
34.788***
|
223.502***
|
34.217***
|
27.840***
|
|
|
(15.439)
|
(2.092)
|
(10.107)
|
(2.081)
|
(2.909)
|
|
|
|
|
|
|
|
|
Kidhome
|
-169.291***
|
-14.004***
|
-91.397***
|
-14.763***
|
-22.189***
|
|
|
(13.742)
|
(1.863)
|
(8.996)
|
(1.852)
|
(2.589)
|
|
|
|
|
|
|
|
|
Teenhome
|
-53.317***
|
-17.520***
|
-134.675***
|
-16.105***
|
-3.342
|
|
|
(12.140)
|
(1.645)
|
(7.947)
|
(1.636)
|
(2.287)
|
|
|
|
|
|
|
|
|
Constant
|
-3,484.482***
|
-326.002***
|
-2,128.061***
|
-322.541***
|
-235.855***
|
|
|
(168.945)
|
(22.897)
|
(110.595)
|
(22.772)
|
(31.829)
|
|
|
|
|
|
|
|
|
|
|
Observations
|
1,437
|
1,437
|
1,437
|
1,437
|
1,437
|
|
R2
|
0.477
|
0.320
|
0.461
|
0.321
|
0.195
|
|
Adjusted R2
|
0.474
|
0.316
|
0.458
|
0.317
|
0.191
|
|
Residual Std. Error (df = 1428)
|
247.835
|
33.590
|
162.239
|
33.406
|
46.692
|
|
F Statistic (df = 8; 1428)
|
162.777***
|
83.946***
|
152.927***
|
84.469***
|
43.325***
|
|
|
|
Note:
|
*p<0.1; **p<0.05; ***p<0.01
|
- Having a master’s degree is positively correlated with wine consumption and negatively correlated with consumption of fruits, sweets and gold products.
- Having a PhD is positively correlated with consumption of wine and negatively correlated with consumption of other products.
- Marital status is not correlated with consumption of any type of good.
- As expected, income is positively correlated with consumption of all the goods (these are normal goods, not inferior ones).
- Surprisingly, having kids and teens at home is negatively correlated with consumption of almost all the goods.
f. Exploration of correlations - Purchase type
# Purchase-channel counts (deals, web, catalog, store) plus monthly web
# visits; each column is one regression outcome.
Purchases <- select(db1, NumDealsPurchases, NumWebPurchases, NumCatalogPurchases, NumStorePurchases, NumWebVisitsMonth)

# Fit one linear model per outcome on the same demographic regressors.
# The formula is built from the outcome's column name and spliced into the
# lm() call, so every fitted model records its real dependent variable.
# Writing `lm(x ~ ...)` inside lapply() instead labels every model's
# dependent variable as the placeholder `x` in the regression table.
models3 <- lapply(setNames(nm = names(Purchases)), function(outcome) {
  model_formula <- reformulate(
    c("Education", "Marital_Status", "log(Income)", "Kidhome", "Teenhome"),
    response = outcome
  )
  eval(bquote(lm(.(model_formula), data = db1)))
})

# HTML regression table, one column per outcome.
stargazer(models3, title="Models: Purchase type", align=TRUE, type='html')
Models: Purchase type
|
|
|
|
Dependent variable:
|
|
|
|
|
|
x
|
|
|
(1)
|
(2)
|
(3)
|
(4)
|
(5)
|
|
|
|
EducationMaster
|
-0.030
|
-0.126
|
-0.166
|
0.018
|
-0.016
|
|
|
(0.121)
|
(0.165)
|
(0.153)
|
(0.171)
|
(0.131)
|
|
|
|
|
|
|
|
|
EducationPhD
|
-0.018
|
0.097
|
-0.051
|
-0.066
|
0.177
|
|
|
(0.108)
|
(0.148)
|
(0.137)
|
(0.153)
|
(0.118)
|
|
|
|
|
|
|
|
|
Marital_StatusMarried
|
0.026
|
-0.005
|
-0.041
|
0.215
|
-0.052
|
|
|
(0.133)
|
(0.182)
|
(0.168)
|
(0.188)
|
(0.145)
|
|
|
|
|
|
|
|
|
Marital_StatusSingle
|
-0.028
|
-0.152
|
-0.071
|
0.161
|
-0.092
|
|
|
(0.145)
|
(0.199)
|
(0.183)
|
(0.205)
|
(0.158)
|
|
|
|
|
|
|
|
|
Marital_StatusWidow
|
0.031
|
-0.088
|
0.054
|
-0.226
|
-0.048
|
|
|
(0.235)
|
(0.321)
|
(0.296)
|
(0.332)
|
(0.255)
|
|
|
|
|
|
|
|
|
log(Income)
|
-0.385***
|
1.726***
|
2.599***
|
3.499***
|
-2.595***
|
|
|
(0.105)
|
(0.144)
|
(0.133)
|
(0.149)
|
(0.114)
|
|
|
|
|
|
|
|
|
Kidhome
|
0.637***
|
-1.217***
|
-1.649***
|
-1.634***
|
0.954***
|
|
|
(0.094)
|
(0.128)
|
(0.118)
|
(0.132)
|
(0.102)
|
|
|
|
|
|
|
|
|
Teenhome
|
1.477***
|
0.745***
|
-0.972***
|
-0.083
|
0.900***
|
|
|
(0.083)
|
(0.113)
|
(0.105)
|
(0.117)
|
(0.090)
|
|
|
|
|
|
|
|
|
Constant
|
5.453***
|
-14.256***
|
-23.984***
|
-31.148***
|
32.407***
|
|
|
(1.153)
|
(1.579)
|
(1.455)
|
(1.629)
|
(1.251)
|
|
|
|
|
|
|
|
|
|
|
Observations
|
1,437
|
1,437
|
1,437
|
1,437
|
1,437
|
|
R2
|
0.223
|
0.264
|
0.435
|
0.465
|
0.427
|
|
Adjusted R2
|
0.218
|
0.260
|
0.432
|
0.462
|
0.424
|
|
Residual Std. Error (df = 1428)
|
1.691
|
2.316
|
2.134
|
2.390
|
1.836
|
|
F Statistic (df = 8; 1428)
|
51.098***
|
63.986***
|
137.363***
|
155.117***
|
133.157***
|
|
|
|
Note:
|
*p<0.1; **p<0.05; ***p<0.01
|
- Income is significant for all the types of purchases but with different signs. The higher the income, the more likely customers are to make purchases through the web, by catalogue and in store, and the less likely they are to use deals and visit the website.
- Having kids at home makes it more likely to buy deals and look at the web.
- Having teens is positively correlated with accepting deals, making web purchases and visiting the website, and negatively correlated with catalog purchases; the coefficient for store purchases is also negative but not significant.