#Market Segmentation
#Market segmentation means identifying consumers with similar tastes or habits
#and then grouping them.
#Doing so makes it easier to target each group with different policies (in this mall-segmentation example, discounts and other sales tactics),
#which in turn maximises profit and minimises cost.
#As there are no pre imposed labels to classify the customers ,
#we use K-means, which is clustering technique used for unsupervised learning.
#K-means
  #A clustering technique that groups similar data points by assigning each point to the cluster whose centroid (the arithmetic mean of its members) is nearest; in other words, it forms groups/clusters of data points that lie close to a common mean. It requires two arguments: the data and the number of centroids (k). The k tells the algorithm how many clusters it needs to make. The optimal number of clusters can be found using:
       #Elbow Method
      #Silhouette Method.

#This study employs the Silhouette Method (SM) to find the optimal number of centroids.
#The aim of the study is to form clusters based on the annual income
#and the spending scores of the customers.
#This would let us understand our customers better,
#and hence help with better strategy planning in the future.

#loading packages
pacman::p_load(pacman,tidyr,ggthemes,ggplot2,plotly,GGally,rio,
               stringr,shiny,rmarkdown,lubridate,psych,ipred,caret,ROCR,pROC,
               DT,dummies,rpart,rpart.plot,httr,randomForest,readr,doParallel,
               xgboost,truncnorm,DMwR,tidyverse,cluster)

# Read the mall customer records from the working directory, then preview
# the first six rows.
Mall_data <- read.csv(file = "Mall_Customers.csv")
head(Mall_data, n = 6L)
##   CustomerID  Genre Age Annual.Income..k.. Spending.Score..1.100.
## 1          1   Male  19                 15                     39
## 2          2   Male  21                 15                     81
## 3          3 Female  20                 16                      6
## 4          4 Female  23                 16                     77
## 5          5 Female  31                 17                     40
## 6          6 Female  22                 17                     76
 #check for any missing values
# Total number of missing cells across the whole data frame.
Mall_data %>%
  is.na() %>%
  sum()
## [1] 0
# Number of missing cells in each column.
Mall_data %>%
  is.na() %>%
  colSums()
##             CustomerID                  Genre                    Age 
##                      0                      0                      0 
##     Annual.Income..k.. Spending.Score..1.100. 
##                      0                      0
#data contain no missing values

#Silhouette Analysis.
#The Silhouette Method measures how similar each observation is to the cluster it is assigned to, relative to the other clusters.
#The method uses average silhouette widths to explore what the "best" value of k should be.
#The silhouette width ranges from -1 to 1:
#values near 1 mean the observation sits well inside its cluster, values near 0 mean it lies between clusters, and negative values suggest it may be in the wrong cluster.
#Thus the k with the average width closest to 1 is chosen as the optimal number of clusters.
#We create df1 with only two variables, income and spending score, the variables for our study. As a rule of thumb, k should not be more than the number of original variables, which in this case is 5.


# df1 with income and spending score as the only variables
# Keep only the two clustering features (annual income and spending score)
# by dropping the identifier and demographic columns.
df1 <- Mall_data %>%
  select(-c(CustomerID, Genre, Age))

# Average silhouette width of a pam() fit for each k from 2 to 5 ----
sil <- vapply(
  2:5,
  function(n_clusters) pam(df1, k = n_clusters)$silinfo$avg.width,
  numeric(1)
)

# Tabulate each k against its average silhouette width and print the table.
sil_df <- data.frame(k = 2:5, sil_width = sil)
sil_df
##   k sil_width
## 1 2 0.3960019
## 2 3 0.4649832
## 3 4 0.3544029
## 4 5 0.5535287
# Line chart of the average silhouette width for each candidate k.
ggplot(data = sil_df, mapping = aes(x = k, y = sil_width)) +
  geom_line(colour = "red") +
  scale_x_continuous(breaks = 2:5) +
  labs(title = "Silhouette Method")

#From our analysis we can see that k = 5
#has the average silhouette width closest to 1.
#Thus we will be forming 5 clusters for our study.

#CLUSTER ANALYSIS
# Fit k-means with 5 centres on annual income and spending score.
# k-means starts from randomly chosen centres and only converges to a local
# optimum, so a single start can return a poor partition: the single-start
# output pasted below explains only 75.3 % of the variance and puts 100 of
# the customers into one cluster. `nstart = 25` tries 25 random starts and
# keeps the fit with the lowest total within-cluster sum of squares.
# The seed keeps the set of random starts reproducible.
# NOTE(review): cluster numbers are arbitrary labels. The printed output
# pasted below was produced by the original single-start call, so regenerate
# it and re-check the written cluster descriptions after running this.
set.seed(1992)
clus <- kmeans(df1, centers = 5, nstart = 25)
clus
## K-means clustering with 5 clusters of sizes 22, 100, 38, 30, 10
## 
## Cluster means:
##   Annual.Income..k.. Spending.Score..1.100.
## 1           25.72727               79.36364
## 2           47.96000               43.25000
## 3           87.00000               18.63158
## 4           78.23333               81.36667
## 5          109.70000               82.00000
## 
## Clustering vector:
##   [1] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
##  [38] 1 2 1 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 4 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4
## [149] 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 5 3 5 3
## [186] 5 3 5 3 5 3 5 3 5 3 5 3 5 3 5
## 
## Within cluster sum of squares by cluster:
## [1]  3519.455 42058.590 14204.842  4370.333  2512.100
##  (between_SS / total_SS =  75.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Attach each customer's k-means cluster label as a new column, then preview
# the first six rows.
Mall_data$cluster <- clus$cluster
head(Mall_data, n = 6L)
##   CustomerID  Genre Age Annual.Income..k.. Spending.Score..1.100. cluster
## 1          1   Male  19                 15                     39       2
## 2          2   Male  21                 15                     81       1
## 3          3 Female  20                 16                      6       2
## 4          4 Female  23                 16                     77       1
## 5          5 Female  31                 17                     40       2
## 6          6 Female  22                 17                     76       1
# Scatter plot of spending score against annual income, one colour per
# cluster, with legend entries named "Cluster 1" to "Cluster 5".
ggplot(
  data = Mall_data,
  mapping = aes(
    x = Annual.Income..k..,
    y = Spending.Score..1.100.,
    colour = factor(cluster)
  )
) +
  geom_point() +
  labs(x = "Annual Income", y = "Spending Score", title = "Cluster analysis") +
  scale_colour_discrete(
    name = " ",
    breaks = as.character(1:5),
    labels = paste("Cluster", 1:5)
  )

#From the graphical representation (and the cluster means printed above) we can make the following deductions about the clusters:
# Cluster 1 has individuals with low income and a high spending score.
# Cluster 2 has individuals with mid range annual income as well as a mid range spending score.
# Cluster 3 has individuals with high annual income but a low spending score.
# Cluster 4 has individuals with high annual income as well as a high spending score.
# Cluster 5 has individuals with the highest annual incomes as well as a high spending score.


#Now let's look at the gender and age distribution across these clusters:

# Overlaid, semi-transparent age density curves, one per cluster.
ggplot(data = Mall_data, mapping = aes(x = Age, fill = factor(cluster))) +
  geom_density(alpha = 0.7) +
  labs(title = "Cluster analysis by Age") +
  scale_fill_discrete(
    name = " ",
    breaks = as.character(1:5),
    labels = paste("Cluster", 1:5)
  )

#From the graphical representation we can see that the clusters are approximately normally distributed.
#They have customers of all age groups in them, with peaks mostly seen in the late 30s or early 40s.
#Cluster 4, which has a high spending score as well as a high annual income,
#has customers that are mostly in their late 20s, mid 40s or early 50s;
#this is a good indication as to why the group has a high spending score as
#well as a high annual income. This might be attributed to the fact that most people
#aged 24 to 45 have families, some with kids.
#Cluster 1, which has low income and a high spending score, is right-tailed; the customers in cluster 1
#are mostly in their late teens or early 20s, with some in their early 40s, which explains why they have a high spending score with low income.
#Cluster 2 and 3 include all the age groups, with mid range income as well as mid range
#spending score.
#Cluster 5 has mid-high annual income, and its customers are in their mid 20s to early 40s with a peak around the 30s.
#Cluster 5 is mostly in the 30s.

# Stacked bars scaled to proportions: the share of each cluster within
# each gender.
ggplot(data = Mall_data, mapping = aes(x = Genre, fill = factor(cluster))) +
  geom_bar(position = "fill") +
  labs(title = "Cluster analysis by Gender", y = "Proportion") +
  scale_fill_discrete(
    name = " ",
    breaks = as.character(1:5),
    labels = paste("Cluster", 1:5)
  )

#All the clusters have a roughly equal representation of the genders.
#With the above clustering analysis,
#a corporation can identify different groups of customers
#and thus target them more precisely by improving its strategies.
#The cluster analysis in the mall project has provided good insights into the customer data base.

# By income: stacked bars scaled to proportions, showing the share of each
# cluster at every annual income value.
ggplot(
  data = Mall_data,
  mapping = aes(x = Annual.Income..k.., fill = factor(cluster))
) +
  geom_bar(position = "fill") +
  labs(title = "Cluster analysis by Income", y = "Proportion") +
  scale_fill_discrete(
    name = " ",
    breaks = as.character(1:5),
    labels = paste("Cluster", 1:5)
  )

#It seems that everyone in Cluster 3 has an annual income ranging from around 60k to beyond 100k.
#Cluster 5 has a range from 100k and above, making them the highest earners; per the cluster means printed above they are also high spenders.
#Cluster 1 has a range of less than 40k, making them the lowest earners.
#Cluster 4 has a mid range income of more than 60k but less than 100k.