# Load the mall-customer data set from CSV into `dataset`.
# NOTE: the path is absolute and machine-specific; adjust it when running
# this script on another machine.
dataset <- read.csv(
  "E:/college docs/Customer segmentation/customer-segmentation-dataset/Mall_Customers.csv",
  header = TRUE  # spelled out: `T` is an ordinary variable that can be reassigned
)

# Show the column names and types of the loaded data frame.
str(dataset)
## 'data.frame':    200 obs. of  5 variables:
##  $ CustomerID            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender                : chr  "Male" "Male" "Female" "Female" ...
##  $ Age                   : int  19 21 20 23 31 22 35 23 64 30 ...
##  $ Annual.Income..k..    : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ Spending.Score..1.100.: int  39 81 6 77 40 76 6 94 3 72 ...
# Five-number summary (plus mean) of the Age column.
summary(dataset[["Age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   28.75   36.00   38.85   49.00   70.00
# Same summary for the annual-income column.
summary(dataset[["Annual.Income..k.."]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   41.50   61.50   60.56   78.00  137.00
# Standard deviation of Age.
sd(dataset[["Age"]])
## [1] 13.96901
# First six rows of the data frame.
head(dataset, n = 6L)
##   CustomerID Gender Age Annual.Income..k.. Spending.Score..1.100.
## 1          1   Male  19                 15                     39
## 2          2   Male  21                 15                     81
## 3          3 Female  20                 16                      6
## 4          4 Female  23                 16                     77
## 5          5 Female  31                 17                     40
## 6          6 Female  22                 17                     76
# Number of customers per gender.
table(dataset[["Gender"]])
## 
## Female   Male 
##    112     88
library(ggplot2)
# Count customers per gender; these counts feed both gender charts below.
a <- table(dataset$Gender)
a
## 
## Female   Male 
##    112     88
# Bar chart of the gender counts, one colour per category. `legend.text` is
# spelled out in full instead of relying on partial argument matching.
barplot(a, main = "Barplot for gender", xlab = "Gender", ylab = "count",
        col = rainbow(length(a)), legend.text = names(a))

# Share of each gender in percent, rounded to whole numbers.
pct <- round(a * 100 / sum(a))

pct
## 
## Female   Male 
##     56     44
# Build the pie labels from the table's own names so every label is paired
# with its own count, whatever order the categories come in.
lbls <- paste(names(a), pct, "%", sep = " ")
lbls
## [1] "Female 56 %" "Male 44 %"
library(plotrix)
# 3D pie chart of the gender counts, labelled with the percentages above.
pie3D(a, labels = lbls, main = "Pie chart for gender", col = c("orange", "green"))

# Distribution of customer age: summary statistics followed by plots.
summary(dataset$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   28.75   36.00   38.85   49.00   70.00
sd(dataset$Age)
## [1] 13.96901
# Base-graphics histogram; `labels = TRUE` prints the bin counts above the bars.
hist(dataset$Age, col = "orange", labels = TRUE, border = "black",
     main = "Histogram for age", xlab = "Age", ylab = "Count", ylim = c(0, 40))

# ggplot2 histogram of age with 50 bins.
ggplot(dataset) +
  geom_histogram(aes(Age), fill = "#006050", col = "#ffffff", bins = 50) +
  ggtitle("Histogram for age")

# Base-graphics box plot of age.
boxplot(dataset$Age, col = "yellow")

# ggplot2 box plot of age.
ggplot(dataset) +
  geom_boxplot(aes(Age), fill = "#005570") +
  ggtitle("Age boxplot")

# Summary statistics and standard deviation of the annual-income column.
summary(dataset[["Annual.Income..k.."]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   41.50   61.50   60.56   78.00  137.00
sd(dataset[["Annual.Income..k.."]])
## [1] 26.26472
# Histogram of annual income with 20 bins.
ggplot(dataset, aes(x = Annual.Income..k..)) +
  geom_histogram(bins = 20, fill = "#ff5566", colour = "#ffffff") +
  ggtitle("Histogram for annual income")

# Box plot of annual income.
ggplot(dataset, aes(x = Annual.Income..k..)) +
  geom_boxplot(fill = "#ff0066") +
  ggtitle("Boxplot for annual income")

# Density plot of annual income with explicit axis labels.
ggplot(dataset, aes(x = Annual.Income..k..)) +
  geom_density(fill = "#006050", colour = "#ffffff") +
  labs(title = "Annual income density plot", x = "Annual Income", y = "Density")

# Distribution of the spending-score column: summary statistics, then plots.
summary(dataset$Spending.Score..1.100.)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   34.75   50.00   50.20   73.00   99.00
# Box plot of the spending score.
ggplot(dataset) +
  geom_boxplot(aes(Spending.Score..1.100.), fill = "#554684") +
  ggtitle("Boxplot for spending score")

# ggplot2 histogram with 50 bins. The fill is a hex colour and therefore
# needs its leading "#"; without it the string is not read as a hex code.
ggplot(dataset) +
  geom_histogram(aes(Spending.Score..1.100.), fill = "#597812", col = "#ffffff", bins = 50)

# Base-graphics histogram; `labels = TRUE` prints the bin counts above the bars.
hist(dataset$Spending.Score..1.100., labels = TRUE, col = "#784691",
     main = "Histogram for spending score", xlab = "Score", ylab = "frequency",
     ylim = c(0, 50))

# Summary: gender = female 56% | male 44%; age = 30-40; income score = 40-50

# K-means clustering
library(NbClust)
## Warning: package 'NbClust' was built under R version 4.0.3
# Run NbClust over 3 to 10 k-means clusters with Euclidean distance to get a
# recommended cluster count. The three clustering variables are selected by
# name rather than by position, so a change in the CSV column order cannot
# silently feed the wrong columns into the clustering.
k <- NbClust(
  data = dataset[, c("Age", "Annual.Income..k..", "Spending.Score..1.100.")],
  distance = "euclidean",
  min.nc = 3,
  max.nc = 10,
  method = "kmeans"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 3 as the best number of clusters 
## * 4 proposed 4 as the best number of clusters 
## * 3 proposed 5 as the best number of clusters 
## * 8 proposed 6 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 2 proposed 9 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  6 
##  
##  
## *******************************************************************

# Number of clusters used for the final k-means fit.
k <- 6

# k-means starts from randomly chosen centres, so both the fit and the
# (arbitrary) cluster numbering can change between runs. Fixing the seed makes
# reruns reproducible.
# NOTE: the numbering in output recorded before the seed was added may differ.
set.seed(123)
k6 <- kmeans(
  dataset[, c("Age", "Annual.Income..k..", "Spending.Score..1.100.")],
  centers = k,       # use the chosen k instead of repeating the literal
  iter.max = 100,
  nstart = 50,       # 50 random starts; the best solution is kept
  algorithm = "Lloyd"
)
k6
## K-means clustering with 6 clusters of sizes 22, 21, 45, 38, 35, 39
## 
## Cluster means:
##        Age Annual.Income..k.. Spending.Score..1.100.
## 1 25.27273           25.72727               79.36364
## 2 44.14286           25.14286               19.52381
## 3 56.15556           53.37778               49.08889
## 4 27.00000           56.65789               49.13158
## 5 41.68571           88.22857               17.28571
## 6 32.69231           86.53846               82.12821
## 
## Clustering vector:
##   [1] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
##  [38] 1 2 1 3 1 3 4 2 1 3 4 4 4 3 4 4 3 3 3 3 3 4 3 3 4 3 3 3 4 3 3 4 4 3 3 3 3
##  [75] 3 4 3 4 4 3 3 4 3 3 4 3 3 4 4 3 3 4 3 4 4 4 3 4 3 4 4 3 3 4 3 4 3 3 3 3 3
## [112] 4 4 4 4 4 3 3 3 3 4 4 4 6 4 6 5 6 5 6 5 6 4 6 5 6 5 6 5 6 5 6 4 6 5 6 5 6
## [149] 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5
## [186] 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6
## 
## Within cluster sum of squares by cluster:
## [1]  4099.818  7732.381  8062.133  7742.895 16690.857 13972.359
##  (between_SS / total_SS =  81.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Print the cluster centres (mean of each variable within each cluster).
k6[["centers"]]
##        Age Annual.Income..k.. Spending.Score..1.100.
## 1 25.27273           25.72727               79.36364
## 2 44.14286           25.14286               19.52381
## 3 56.15556           53.37778               49.08889
## 4 27.00000           56.65789               49.13158
## 5 41.68571           88.22857               17.28571
## 6 32.69231           86.53846               82.12821
# Annual income vs. spending score, coloured by k-means cluster.
# The legend labels are generated from the cluster levels, so they stay correct
# if the number of clusters changes.
ggplot(dataset, aes(Annual.Income..k.., Spending.Score..1.100.)) +
  geom_point(aes(col = as.factor(k6$cluster))) +
  ggtitle("Segmentation of mall customers", subtitle = "using kmeans clustering") +
  scale_color_discrete(name = " ", labels = function(x) paste("cluster", x))

# Annual income vs. age, coloured by the same cluster assignment.
ggplot(dataset, aes(Annual.Income..k.., Age)) +
  geom_point(aes(col = as.factor(k6$cluster))) +
  scale_color_discrete(name = " ", labels = function(x) paste("cluster", x))

# Cross-tabulate each customer ID against its assigned cluster
# (one row per customer, one column per cluster).
table(dataset[["CustomerID"]], k6[["cluster"]])
##      
##       1 2 3 4 5 6
##   1   0 1 0 0 0 0
##   2   1 0 0 0 0 0
##   3   0 1 0 0 0 0
##   4   1 0 0 0 0 0
##   5   0 1 0 0 0 0
##   6   1 0 0 0 0 0
##   7   0 1 0 0 0 0
##   8   1 0 0 0 0 0
##   9   0 1 0 0 0 0
##   10  1 0 0 0 0 0
##   11  0 1 0 0 0 0
##   12  1 0 0 0 0 0
##   13  0 1 0 0 0 0
##   14  1 0 0 0 0 0
##   15  0 1 0 0 0 0
##   16  1 0 0 0 0 0
##   17  0 1 0 0 0 0
##   18  1 0 0 0 0 0
##   19  0 1 0 0 0 0
##   20  1 0 0 0 0 0
##   21  0 1 0 0 0 0
##   22  1 0 0 0 0 0
##   23  0 1 0 0 0 0
##   24  1 0 0 0 0 0
##   25  0 1 0 0 0 0
##   26  1 0 0 0 0 0
##   27  0 1 0 0 0 0
##   28  1 0 0 0 0 0
##   29  0 1 0 0 0 0
##   30  1 0 0 0 0 0
##   31  0 1 0 0 0 0
##   32  1 0 0 0 0 0
##   33  0 1 0 0 0 0
##   34  1 0 0 0 0 0
##   35  0 1 0 0 0 0
##   36  1 0 0 0 0 0
##   37  0 1 0 0 0 0
##   38  1 0 0 0 0 0
##   39  0 1 0 0 0 0
##   40  1 0 0 0 0 0
##   41  0 0 1 0 0 0
##   42  1 0 0 0 0 0
##   43  0 0 1 0 0 0
##   44  0 0 0 1 0 0
##   45  0 1 0 0 0 0
##   46  1 0 0 0 0 0
##   47  0 0 1 0 0 0
##   48  0 0 0 1 0 0
##   49  0 0 0 1 0 0
##   50  0 0 0 1 0 0
##   51  0 0 1 0 0 0
##   52  0 0 0 1 0 0
##   53  0 0 0 1 0 0
##   54  0 0 1 0 0 0
##   55  0 0 1 0 0 0
##   56  0 0 1 0 0 0
##   57  0 0 1 0 0 0
##   58  0 0 1 0 0 0
##   59  0 0 0 1 0 0
##   60  0 0 1 0 0 0
##   61  0 0 1 0 0 0
##   62  0 0 0 1 0 0
##   63  0 0 1 0 0 0
##   64  0 0 1 0 0 0
##   65  0 0 1 0 0 0
##   66  0 0 0 1 0 0
##   67  0 0 1 0 0 0
##   68  0 0 1 0 0 0
##   69  0 0 0 1 0 0
##   70  0 0 0 1 0 0
##   71  0 0 1 0 0 0
##   72  0 0 1 0 0 0
##   73  0 0 1 0 0 0
##   74  0 0 1 0 0 0
##   75  0 0 1 0 0 0
##   76  0 0 0 1 0 0
##   77  0 0 1 0 0 0
##   78  0 0 0 1 0 0
##   79  0 0 0 1 0 0
##   80  0 0 1 0 0 0
##   81  0 0 1 0 0 0
##   82  0 0 0 1 0 0
##   83  0 0 1 0 0 0
##   84  0 0 1 0 0 0
##   85  0 0 0 1 0 0
##   86  0 0 1 0 0 0
##   87  0 0 1 0 0 0
##   88  0 0 0 1 0 0
##   89  0 0 0 1 0 0
##   90  0 0 1 0 0 0
##   91  0 0 1 0 0 0
##   92  0 0 0 1 0 0
##   93  0 0 1 0 0 0
##   94  0 0 0 1 0 0
##   95  0 0 0 1 0 0
##   96  0 0 0 1 0 0
##   97  0 0 1 0 0 0
##   98  0 0 0 1 0 0
##   99  0 0 1 0 0 0
##   100 0 0 0 1 0 0
##   101 0 0 0 1 0 0
##   102 0 0 1 0 0 0
##   103 0 0 1 0 0 0
##   104 0 0 0 1 0 0
##   105 0 0 1 0 0 0
##   106 0 0 0 1 0 0
##   107 0 0 1 0 0 0
##   108 0 0 1 0 0 0
##   109 0 0 1 0 0 0
##   110 0 0 1 0 0 0
##   111 0 0 1 0 0 0
##   112 0 0 0 1 0 0
##   113 0 0 0 1 0 0
##   114 0 0 0 1 0 0
##   115 0 0 0 1 0 0
##   116 0 0 0 1 0 0
##   117 0 0 1 0 0 0
##   118 0 0 1 0 0 0
##   119 0 0 1 0 0 0
##   120 0 0 1 0 0 0
##   121 0 0 0 1 0 0
##   122 0 0 0 1 0 0
##   123 0 0 0 1 0 0
##   124 0 0 0 0 0 1
##   125 0 0 0 1 0 0
##   126 0 0 0 0 0 1
##   127 0 0 0 0 1 0
##   128 0 0 0 0 0 1
##   129 0 0 0 0 1 0
##   130 0 0 0 0 0 1
##   131 0 0 0 0 1 0
##   132 0 0 0 0 0 1
##   133 0 0 0 1 0 0
##   134 0 0 0 0 0 1
##   135 0 0 0 0 1 0
##   136 0 0 0 0 0 1
##   137 0 0 0 0 1 0
##   138 0 0 0 0 0 1
##   139 0 0 0 0 1 0
##   140 0 0 0 0 0 1
##   141 0 0 0 0 1 0
##   142 0 0 0 0 0 1
##   143 0 0 0 1 0 0
##   144 0 0 0 0 0 1
##   145 0 0 0 0 1 0
##   146 0 0 0 0 0 1
##   147 0 0 0 0 1 0
##   148 0 0 0 0 0 1
##   149 0 0 0 0 1 0
##   150 0 0 0 0 0 1
##   151 0 0 0 0 1 0
##   152 0 0 0 0 0 1
##   153 0 0 0 0 1 0
##   154 0 0 0 0 0 1
##   155 0 0 0 0 1 0
##   156 0 0 0 0 0 1
##   157 0 0 0 0 1 0
##   158 0 0 0 0 0 1
##   159 0 0 0 0 1 0
##   160 0 0 0 0 0 1
##   161 0 0 0 0 1 0
##   162 0 0 0 0 0 1
##   163 0 0 0 0 1 0
##   164 0 0 0 0 0 1
##   165 0 0 0 0 1 0
##   166 0 0 0 0 0 1
##   167 0 0 0 0 1 0
##   168 0 0 0 0 0 1
##   169 0 0 0 0 1 0
##   170 0 0 0 0 0 1
##   171 0 0 0 0 1 0
##   172 0 0 0 0 0 1
##   173 0 0 0 0 1 0
##   174 0 0 0 0 0 1
##   175 0 0 0 0 1 0
##   176 0 0 0 0 0 1
##   177 0 0 0 0 1 0
##   178 0 0 0 0 0 1
##   179 0 0 0 0 1 0
##   180 0 0 0 0 0 1
##   181 0 0 0 0 1 0
##   182 0 0 0 0 0 1
##   183 0 0 0 0 1 0
##   184 0 0 0 0 0 1
##   185 0 0 0 0 1 0
##   186 0 0 0 0 0 1
##   187 0 0 0 0 1 0
##   188 0 0 0 0 0 1
##   189 0 0 0 0 1 0
##   190 0 0 0 0 0 1
##   191 0 0 0 0 1 0
##   192 0 0 0 0 0 1
##   193 0 0 0 0 1 0
##   194 0 0 0 0 0 1
##   195 0 0 0 0 1 0
##   196 0 0 0 0 0 1
##   197 0 0 0 0 1 0
##   198 0 0 0 0 0 1
##   199 0 0 0 0 1 0
##   200 0 0 0 0 0 1