# Import the marketing-campaign export. The file is tab-delimited and its
# first row carries the column names.
data <- read.table(
  file = "D:/Fall 2021/DA Application/project/marketing_campaign.csv",
  header = TRUE,
  sep = "\t"
)
# Working table used by the rest of the script; `data` keeps the raw import.
df <- data.frame(data)
# Show the first rows as a quick check of the import.
head(df)
##     ID Year_Birth  Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1 5524       1957 Graduation         Single  58138       0        0  04-09-2012
## 2 2174       1954 Graduation         Single  46344       1        1  08-03-2014
## 3 4141       1965 Graduation       Together  71613       0        0  21-08-2013
## 4 6182       1984 Graduation       Together  26646       1        0  10-02-2014
## 5 5324       1981        PhD        Married  58293       1        0  19-01-2014
## 6 7446       1967     Master       Together  62513       0        1  09-09-2013
##   Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1      58      635        88             546             172               88
## 2      38       11         1               6               2                1
## 3      26      426        49             127             111               21
## 4      26       11         4              20              10                3
## 5      94      173        43             118              46               27
## 6      16      520        42              98               0               42
##   MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1           88                 3               8                  10
## 2            6                 2               1                   1
## 3           42                 1               8                   2
## 4            5                 2               2                   0
## 5           15                 5               5                   3
## 6           14                 2               6                   4
##   NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1                 4                 7            0            0            0
## 2                 2                 5            0            0            0
## 3                10                 4            0            0            0
## 4                 4                 6            0            0            0
## 5                 6                 5            0            0            0
## 6                10                 6            0            0            0
##   AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
## 1            0            0        0             3        11        1
## 2            0            0        0             3        11        0
## 3            0            0        0             3        11        0
## 4            0            0        0             3        11        0
## 5            0            0        0             3        11        0
## 6            0            0        0             3        11        0
# Total number of missing cells across all columns of the table.
sum(colSums(is.na(df)))
## [1] 24
# Remove every row that contains at least one NA, then inspect the structure
# of what is left.
df <- na.omit(df)
str(df)
## 'data.frame':    2216 obs. of  29 variables:
##  $ ID                 : int  5524 2174 4141 6182 5324 7446 965 6177 4855 5899 ...
##  $ Year_Birth         : int  1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 ...
##  $ Education          : chr  "Graduation" "Graduation" "Graduation" "Graduation" ...
##  $ Marital_Status     : chr  "Single" "Single" "Together" "Together" ...
##  $ Income             : int  58138 46344 71613 26646 58293 62513 55635 33454 30351 5648 ...
##  $ Kidhome            : int  0 1 0 1 1 0 0 1 1 1 ...
##  $ Teenhome           : int  0 1 0 0 0 1 1 0 0 1 ...
##  $ Dt_Customer        : chr  "04-09-2012" "08-03-2014" "21-08-2013" "10-02-2014" ...
##  $ Recency            : int  58 38 26 26 94 16 34 32 19 68 ...
##  $ MntWines           : int  635 11 426 11 173 520 235 76 14 28 ...
##  $ MntFruits          : int  88 1 49 4 43 42 65 10 0 0 ...
##  $ MntMeatProducts    : int  546 6 127 20 118 98 164 56 24 6 ...
##  $ MntFishProducts    : int  172 2 111 10 46 0 50 3 3 1 ...
##  $ MntSweetProducts   : int  88 1 21 3 27 42 49 1 3 1 ...
##  $ MntGoldProds       : int  88 6 42 5 15 14 27 23 2 13 ...
##  $ NumDealsPurchases  : int  3 2 1 2 5 2 4 2 1 1 ...
##  $ NumWebPurchases    : int  8 1 8 2 5 6 7 4 3 1 ...
##  $ NumCatalogPurchases: int  10 1 2 0 3 4 3 0 0 0 ...
##  $ NumStorePurchases  : int  4 2 10 4 6 10 7 4 2 0 ...
##  $ NumWebVisitsMonth  : int  7 5 4 6 5 6 6 8 9 20 ...
##  $ AcceptedCmp3       : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ AcceptedCmp4       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp5       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp1       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp2       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Complain           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Z_CostContact      : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Z_Revenue          : int  11 11 11 11 11 11 11 11 11 11 ...
##  $ Response           : int  1 0 0 0 0 0 0 0 1 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:24] 11 28 44 49 59 72 91 92 93 129 ...
##   ..- attr(*, "names")= chr [1:24] "11" "28" "44" "49" ...
# ---- Feature engineering ----
# Year against which age and customer seniority are measured.
# NOTE(review): fixed at 2021 to match the recorded results; confirm whether
# it should follow the current year instead.
reference_year <- 2021

# Customer age in years as of the reference year.
df$Age <- reference_year - df$Year_Birth

# Collapse Education into two groups. Masks are computed before any
# assignment so the recoding does not depend on statement order.
is_ug <- df$Education %in% c("2n Cycle", "Basic")
is_pg <- df$Education %in% c("Graduation", "Master", "PhD")
df$Education[is_ug] <- "UG"
df$Education[is_pg] <- "PG"

# Collapse Marital_Status into "Single" / "Couple"; rows already labelled
# "Single" are left as they are.
is_single <- df$Marital_Status %in%
  c("Divorced", "Absurd", "YOLO", "Widow", "Alone")
is_couple <- df$Marital_Status %in% c("Together", "Married")
df$Marital_Status[is_single] <- "Single"
df$Marital_Status[is_couple] <- "Couple"

# Enrolment year, parsed from the dd-mm-yyyy strings in Dt_Customer with
# base R so no string package has to be attached for this step.
signup_date <- as.Date(df$Dt_Customer, format = "%d-%m-%Y")
df$Customer_year <- as.numeric(format(signup_date, "%Y"))
df$Customer_Seniority <- reference_year - df$Customer_year

# Number of children in the household (kids + teenagers).
df$Child <- df$Kidhome + df$Teenhome

# Total amount spent across the six product categories.
df$Amt_Spent <- df$MntWines + df$MntFishProducts + df$MntFruits +
  df$MntGoldProds + df$MntMeatProducts + df$MntSweetProducts

# Total purchases across web, catalog and store channels
# (deal purchases are kept as their own column and not added here).
df$Num_Purchases_made <- df$NumWebPurchases + df$NumCatalogPurchases +
  df$NumStorePurchases

# Keep only the analysis columns. Selecting by name (instead of by position)
# keeps this correct if columns are added or reordered upstream.
analysis_cols <- c(
  "Age", "Education", "Marital_Status", "Income", "Child",
  "Customer_Seniority", "Recency", "Amt_Spent", "Num_Purchases_made",
  "NumDealsPurchases", "NumWebVisitsMonth"
)
df <- df[analysis_cols]

# Correlogram of the analysis table: ellipses below the diagonal, point
# clouds above it, min/max on the diagonal.
corrgram(
  df,
  order = TRUE,
  lower.panel = panel.ellipse,
  upper.panel = panel.pts,
  text.panel = panel.txt,
  diag.panel = panel.minmax,
  main = "Correlation between various variables "
)

# Copy of the analysis table in which the two categorical columns are
# replaced by their numeric factor codes, so they can enter the correlogram.
data2 <- df
data2$Education <- as.numeric(as.factor(data2$Education))
data2$Marital_Status <- as.numeric(as.factor(data2$Marital_Status))

# Shaded lower-triangle correlogram of everything except the first column.
corrgram(
  data2[-1],
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = NULL,
  text.panel = panel.txt,
  main = "Customer Data"
)

# Base-graphics overview plot of the whole analysis table.
plot(df)

# Density curve of customer age.
age_plot <- ggplot(df, aes(x = Age))
age_plot + geom_density()

# Number of customers in each Education group. geom_bar() counts rows per
# discrete x value by default; geom_histogram(stat = "count") draws the same
# bars but emits an "Ignoring unknown parameters" warning because the binning
# arguments it passes along are not understood by the count stat.
edu_plot1 <- ggplot(data = df, aes(Education))
edu_plot1 + geom_bar()

# Notched box plot of Income for each Education group, with the y axis
# limited to 0-180000.
# NOTE: ylim() removes observations outside the limits before the box-plot
# statistics are computed, which is what the "Removed 1 rows" warning reports.
eduplot <- ggplot(df, aes(x = Education, y = Income, fill = Education)) +
  ylim(0, 180000) +
  geom_boxplot(
    outlier.colour = "black",
    outlier.shape = 16,
    outlier.size = 2,
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    notch = TRUE
  )
eduplot
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

# Histogram of Income with bars filled by Education group; binning is left
# at the ggplot2 default.
edu_plot2 <- ggplot(df, aes(x = Income, fill = Education))
edu_plot2 + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Keep only rows with Income below 666665. In the recorded run this removes
# a single extreme record (2216 rows before, 2215 after).
df <- subset(df, Income < 666665)
library(caret)

# Dummy-encode the categorical columns. fullRank = TRUE keeps one indicator
# fewer than the number of levels (here: EducationUG, Marital_StatusSingle).
# TRUE is spelled out because T is a reassignable variable, not a constant.
dmy <- dummyVars(" ~ .", data = df, fullRank = TRUE)
dat_transformed <- data.frame(predict(dmy, newdata = df))

# Compact column-by-column preview of the encoded table.
glimpse(dat_transformed)
## Rows: 2,215
## Columns: 11
## $ Age                  <dbl> 64, 67, 56, 37, 40, 54, 50, 36, 47, 71, 45, 62, 6~
## $ EducationUG          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0~
## $ Marital_StatusSingle <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0~
## $ Income               <dbl> 58138, 46344, 71613, 26646, 58293, 62513, 55635, ~
## $ Child                <dbl> 0, 2, 0, 1, 1, 1, 1, 1, 1, 2, 0, 0, 2, 0, 0, 2, 0~
## $ Customer_Seniority   <dbl> 9, 7, 8, 7, 7, 8, 9, 8, 8, 7, 9, 8, 8, 9, 9, 9, 9~
## $ Recency              <dbl> 58, 38, 26, 26, 94, 16, 34, 32, 19, 68, 59, 82, 5~
## $ Amt_Spent            <dbl> 1617, 27, 776, 53, 422, 716, 590, 169, 46, 49, 61~
## $ Num_Purchases_made   <dbl> 22, 4, 20, 6, 14, 20, 17, 8, 5, 1, 5, 15, 12, 4, ~
## $ NumDealsPurchases    <dbl> 3, 2, 1, 2, 5, 2, 4, 2, 1, 1, 1, 1, 3, 1, 1, 3, 2~
## $ NumWebVisitsMonth    <dbl> 7, 5, 4, 6, 5, 6, 6, 8, 9, 20, 8, 2, 6, 8, 3, 8, ~
# Clustering features: every encoded column except Income. The column is
# dropped by name so the result stays correct if the encoding step produces
# a different number or order of columns.
dfc <- dat_transformed[setdiff(names(dat_transformed), "Income")]
str(dfc)
## 'data.frame':    2215 obs. of  10 variables:
##  $ Age                 : num  64 67 56 37 40 54 50 36 47 71 ...
##  $ EducationUG         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Marital_StatusSingle: num  1 1 0 0 0 0 1 0 0 0 ...
##  $ Child               : num  0 2 0 1 1 1 1 1 1 2 ...
##  $ Customer_Seniority  : num  9 7 8 7 7 8 9 8 8 7 ...
##  $ Recency             : num  58 38 26 26 94 16 34 32 19 68 ...
##  $ Amt_Spent           : num  1617 27 776 53 422 ...
##  $ Num_Purchases_made  : num  22 4 20 6 14 20 17 8 5 1 ...
##  $ NumDealsPurchases   : num  3 2 1 2 5 2 4 2 1 1 ...
##  $ NumWebVisitsMonth   : num  7 5 4 6 5 6 6 8 9 20 ...
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Elbow plot (within-cluster sum of squares vs. number of clusters) with a
# dashed guide line at k = 3. kmeans() starts from random centres, so the RNG
# is seeded first to make the curve reproducible between runs.
set.seed(123)
fviz_nbclust(dfc, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2)

# Fix the RNG state so the random starts below give the same fit every run.
set.seed(123)

# k-means with 3 clusters, using 10 random starts.
# NOTE(review): the features are clustered on their raw scales, so columns
# with large ranges (e.g. Amt_Spent) weigh most in the distance - confirm
# this is intended.
km.res <- kmeans(x = dfc, centers = 3, nstart = 10)
print(km.res[["centers"]])
##        Age EducationUG Marital_StatusSingle     Child Customer_Seniority
## 1 50.77689  0.13944223            0.3474104 1.2302789           7.909163
## 2 54.68220  0.09983361            0.3594010 0.7504160           8.004992
## 3 52.91643  0.05292479            0.3732591 0.2869081           8.133705
##    Recency Amt_Spent Num_Purchases_made NumDealsPurchases NumWebVisitsMonth
## 1 48.64940  152.7586           7.258167          2.339442          6.314741
## 2 48.80865  921.4676          19.131448          2.728785          4.379368
## 3 50.69638 1670.4819          20.091922          1.584958          3.409471
# Number of observations assigned to each cluster.
print(km.res[["size"]])
## [1] 1255  601  359
# Between-cluster sum of squares as a fraction of the total sum of squares.
print(km.res[["betweenss"]] / km.res[["totss"]])
## [1] 0.8976134
# Total sum of squares.
print(km.res[["totss"]])
## [1] 807143674
# Between-cluster sum of squares.
print(km.res[["betweenss"]])
## [1] 724503006
# Within-cluster sum of squares, one value per cluster.
print(km.res[["withinss"]])
## [1] 28087287 27493826 27059556
# Cluster plot of the k-means result, drawn as points with "norm" ellipses.
fviz_cluster(km.res, dfc, geom = "point", ellipse.type = "norm", repel = TRUE)

# 3-D scatter of Income x Amt_Spent x Num_Purchases_made, coloured by cluster.
# Income is read from `df`: it was removed from `dfc` when the clustering
# features were built, so `dfc$Income` is NULL at this point. `dfc` is derived
# row-for-row from `df`, so the two tables are aligned.
plot_ly(
  x = df$Income,
  y = dfc$Amt_Spent,
  z = dfc$Num_Purchases_made,
  type = "scatter3d",
  mode = "markers",
  color = as.factor(km.res$cluster)
)
# Attach the cluster assignment as a factor so it can be used as a discrete
# x axis and fill in the plots below.
dfc$cluster <- as.factor(km.res$cluster)

# Amount spent per cluster (notched). TRUE is used instead of T throughout
# because T is a reassignable variable, not a constant.
spentplot <- ggplot(dfc, aes(x = cluster, y = Amt_Spent, fill = cluster)) +
  geom_boxplot(
    outlier.colour = "black",
    outlier.shape = 16,
    outlier.size = 2,
    notch = TRUE
  )
spentplot

# Age per cluster (notched).
# NOTE(review): this reuses the name `spentplot` for the age plot; the name
# is kept so anything relying on the final binding still works.
spentplot <- ggplot(dfc, aes(x = cluster, y = Age, fill = cluster)) +
  geom_boxplot(
    outlier.colour = "black",
    outlier.shape = 16,
    outlier.size = 2,
    notch = TRUE
  )
spentplot

# Number of deal purchases per cluster.
numdealplot <- ggplot(
  dfc,
  aes(x = cluster, y = NumDealsPurchases, fill = cluster)
) +
  geom_boxplot(outlier.colour = "black", outlier.shape = 16, outlier.size = 2)
numdealplot

# Income was left out of the clustering features; bring it back from `df`
# (same rows, same order) so it can be profiled per cluster.
dfc$Income <- df$Income
incomeplot <- ggplot(dfc, aes(x = cluster, y = Income, fill = cluster)) +
  geom_boxplot(
    outlier.colour = "black",
    outlier.shape = 16,
    outlier.size = 2,
    notch = TRUE
  )
incomeplot

# Customer seniority per cluster.
# NOTE(review): this reuses the name `numdealplot` for the seniority plot;
# the name is kept so anything relying on the final binding still works.
numdealplot <- ggplot(
  dfc,
  aes(x = cluster, y = Customer_Seniority, fill = cluster)
) +
  geom_boxplot(outlier.colour = "black", outlier.shape = 16, outlier.size = 2)
numdealplot