# Load the mall-customer data and take a first look at it: structure,
# summary statistics for age and income, first rows, and gender counts.
# NOTE(review): the path is absolute and machine-specific; change it (or use
# a project-relative path) before running on another machine.
data_path <- "E:/college docs/Customer segmentation/customer-segmentation-dataset/Mall_Customers.csv"
# `TRUE` is spelled out because the shorthand `T` is an ordinary variable
# that can be reassigned.
dataset <- read.csv(data_path, header = TRUE)
str(dataset)
## 'data.frame': 200 obs. of 5 variables:
## $ CustomerID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : chr "Male" "Male" "Female" "Female" ...
## $ Age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ Annual.Income..k.. : int 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending.Score..1.100.: int 39 81 6 77 40 76 6 94 3 72 ...
summary(dataset$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 28.75 36.00 38.85 49.00 70.00
summary(dataset$Annual.Income..k..)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.00 41.50 61.50 60.56 78.00 137.00
sd(dataset$Age)
## [1] 13.96901
head(dataset)
## CustomerID Gender Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76
# Number of customers per gender.
table(dataset$Gender)
##
## Female Male
## 112 88
library(ggplot2)
# Gender distribution: counts as a bar chart, percentage shares as a 3D pie.
a <- table(dataset$Gender)
a
##
## Female Male
## 112 88
# `legend.text` is written out in full; the previous `legend =` only worked
# through partial argument matching.
barplot(a, main = "Barplot for gender", xlab = "Gender", ylab = "count",
        col = rainbow(2), legend.text = names(a))
# Share of each gender, rounded to whole percent.
pct <- round(a * 100 / sum(a))
pct
##
## Female Male
## 56 44
# Take the label text from the table itself so a label can never be paired
# with the wrong count if the category order changes.
lbls <- paste(names(a), pct, "%", sep = " ")
lbls
## [1] "Female 56 %" "Male 44 %"
library(plotrix)
pie3D(a, labels = lbls, main = "Pie chart for gender", col = c("orange", "green"))
# Age: summary statistics, then histograms and boxplots drawn with both
# base graphics and ggplot2.
summary(dataset$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 28.75 36.00 38.85 49.00 70.00
sd(dataset$Age)
## [1] 13.96901
# Title typo ("Hostogram") fixed; `labels = TRUE` prints the count above
# each bar, and `ylim` leaves headroom for those labels.
hist(dataset$Age, col = "orange", labels = TRUE, border = "black",
     main = "Histogram for age", xlab = "Age", ylab = "Count",
     ylim = c(0, 40))
# Title previously read "histogram from age".
ggplot(dataset) +
  geom_histogram(aes(Age), fill = "#006050", col = "#ffffff", bins = 50) +
  ggtitle("Histogram for age")
boxplot(dataset$Age, col = "yellow")
ggplot(dataset) +
  geom_boxplot(aes(Age), fill = "#005570") +
  ggtitle("Age boxplot")
# Annual income: descriptive statistics followed by three views of the
# distribution (histogram, boxplot, density curve).
summary(dataset$Annual.Income..k..)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.00 41.50 61.50 60.56 78.00 137.00
sd(dataset$Annual.Income..k..)
## [1] 26.26472
# Histogram with 20 bins.
ggplot(data = dataset) +
  geom_histogram(
    mapping = aes(x = Annual.Income..k..),
    colour = "#ffffff",
    fill = "#ff5566",
    bins = 20
  ) +
  labs(title = "Histogram for annual income")
# Boxplot of the same variable.
ggplot(data = dataset) +
  geom_boxplot(mapping = aes(x = Annual.Income..k..), fill = "#ff0066") +
  labs(title = "Boxplot for annual income")
# Kernel density estimate with readable axis titles.
ggplot(data = dataset) +
  geom_density(
    mapping = aes(x = Annual.Income..k..),
    fill = "#006050",
    colour = "#ffffff"
  ) +
  labs(
    title = "Annual income density plot",
    x = "Annual Income",
    y = "Density"
  )
# Spending score: summary statistics, a boxplot, and two histograms
# (ggplot2 and base graphics).
summary(dataset$Spending.Score..1.100.)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 34.75 50.00 50.20 73.00 99.00
ggplot(dataset) +
  geom_boxplot(aes(Spending.Score..1.100.), fill = "#554684") +
  ggtitle("Boxplot for spending score")
# A hex colour needs its leading "#"; the previous "597812" was not read as
# the intended hex colour.
ggplot(dataset) +
  geom_histogram(aes(Spending.Score..1.100.), fill = "#597812",
                 col = "#ffffff", bins = 50)
# `labels = TRUE` prints the count above each bar; `ylim` leaves headroom.
hist(dataset$Spending.Score..1.100., labels = TRUE, col = "#784691",
     main = "Histogram for spending score", xlab = "Score",
     ylab = "frequency", ylim = c(0, 50))
# Summary: gender = female 56% | male 44%; age = 30-40; income score = 40-50
# K-means clustering
library(NbClust)
## Warning: package 'NbClust' was built under R version 4.0.3
# Ask NbClust to evaluate k-means partitions with 3 to 10 clusters on the
# three numeric features (Euclidean distance) and report which cluster
# count its indices favour.
k <- NbClust(
  data = dataset[, c("Age", "Annual.Income..k..", "Spending.Score..1.100.")],
  distance = "euclidean",
  min.nc = 3,
  max.nc = 10,
  method = "kmeans"
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 3 as the best number of clusters
## * 4 proposed 4 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
## * 8 proposed 6 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 2 proposed 9 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 6
##
##
## *******************************************************************
# Number of clusters, taken from the NbClust majority vote.
k <- 6
# k-means starts from random centres, so cluster numbering differs between
# runs unless the RNG is seeded. Fixing the seed keeps the cluster labels
# used in later plots and tables stable from run to run.
# NOTE(review): the output recorded below came from an unseeded run; the
# clusters may be numbered differently after this change.
set.seed(123)
# Use `k` rather than repeating the literal so the choice lives in one place.
k6 <- kmeans(dataset[, 3:5], centers = k, iter.max = 100, nstart = 50,
             algorithm = "Lloyd")
k6
## K-means clustering with 6 clusters of sizes 22, 21, 45, 38, 35, 39
##
## Cluster means:
## Age Annual.Income..k.. Spending.Score..1.100.
## 1 25.27273 25.72727 79.36364
## 2 44.14286 25.14286 19.52381
## 3 56.15556 53.37778 49.08889
## 4 27.00000 56.65789 49.13158
## 5 41.68571 88.22857 17.28571
## 6 32.69231 86.53846 82.12821
##
## Clustering vector:
## [1] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
## [38] 1 2 1 3 1 3 4 2 1 3 4 4 4 3 4 4 3 3 3 3 3 4 3 3 4 3 3 3 4 3 3 4 4 3 3 3 3
## [75] 3 4 3 4 4 3 3 4 3 3 4 3 3 4 4 3 3 4 3 4 4 4 3 4 3 4 4 3 3 4 3 4 3 3 3 3 3
## [112] 4 4 4 4 4 3 3 3 3 4 4 4 6 4 6 5 6 5 6 5 6 4 6 5 6 5 6 5 6 5 6 4 6 5 6 5 6
## [149] 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6 5
## [186] 6 5 6 5 6 5 6 5 6 5 6 5 6 5 6
##
## Within cluster sum of squares by cluster:
## [1] 4099.818 7732.381 8062.133 7742.895 16690.857 13972.359
## (between_SS / total_SS = 81.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Mean of each feature within each cluster.
k6$centers
## Age Annual.Income..k.. Spending.Score..1.100.
## 1 25.27273 25.72727 79.36364
## 2 44.14286 25.14286 19.52381
## 3 56.15556 53.37778 49.08889
## 4 27.00000 56.65789 49.13158
## 5 41.68571 88.22857 17.28571
## 6 32.69231 86.53846 82.12821
# Legend labels are built once from the number of fitted clusters instead of
# being typed out separately for every plot.
cluster_labels <- paste("cluster", seq_len(nrow(k6$centers)))
# Income vs. spending score, coloured by cluster. The stray
# `col = as.factor(k6$centers)` argument to ggplot() was outside aes() and
# did not contribute to the plot, so it is removed; the title typo is fixed.
ggplot(dataset, aes(Annual.Income..k.., Spending.Score..1.100.)) +
  geom_point(aes(col = as.factor(k6$cluster))) +
  ggtitle("Segmentation of mall customers",
          subtitle = "using kmeans clustering") +
  scale_color_discrete(name = " ", labels = cluster_labels)
# Income vs. age, coloured by cluster.
ggplot(dataset, aes(Annual.Income..k.., Age)) +
  geom_point(aes(col = as.factor(k6$cluster))) +
  scale_color_discrete(name = " ", labels = cluster_labels)
# Customer-by-cluster membership table: one row per customer with a single 1
# in the column of its assigned cluster.
table(dataset$CustomerID, k6$cluster)
##
## 1 2 3 4 5 6
## 1 0 1 0 0 0 0
## 2 1 0 0 0 0 0
## 3 0 1 0 0 0 0
## 4 1 0 0 0 0 0
## 5 0 1 0 0 0 0
## 6 1 0 0 0 0 0
## 7 0 1 0 0 0 0
## 8 1 0 0 0 0 0
## 9 0 1 0 0 0 0
## 10 1 0 0 0 0 0
## 11 0 1 0 0 0 0
## 12 1 0 0 0 0 0
## 13 0 1 0 0 0 0
## 14 1 0 0 0 0 0
## 15 0 1 0 0 0 0
## 16 1 0 0 0 0 0
## 17 0 1 0 0 0 0
## 18 1 0 0 0 0 0
## 19 0 1 0 0 0 0
## 20 1 0 0 0 0 0
## 21 0 1 0 0 0 0
## 22 1 0 0 0 0 0
## 23 0 1 0 0 0 0
## 24 1 0 0 0 0 0
## 25 0 1 0 0 0 0
## 26 1 0 0 0 0 0
## 27 0 1 0 0 0 0
## 28 1 0 0 0 0 0
## 29 0 1 0 0 0 0
## 30 1 0 0 0 0 0
## 31 0 1 0 0 0 0
## 32 1 0 0 0 0 0
## 33 0 1 0 0 0 0
## 34 1 0 0 0 0 0
## 35 0 1 0 0 0 0
## 36 1 0 0 0 0 0
## 37 0 1 0 0 0 0
## 38 1 0 0 0 0 0
## 39 0 1 0 0 0 0
## 40 1 0 0 0 0 0
## 41 0 0 1 0 0 0
## 42 1 0 0 0 0 0
## 43 0 0 1 0 0 0
## 44 0 0 0 1 0 0
## 45 0 1 0 0 0 0
## 46 1 0 0 0 0 0
## 47 0 0 1 0 0 0
## 48 0 0 0 1 0 0
## 49 0 0 0 1 0 0
## 50 0 0 0 1 0 0
## 51 0 0 1 0 0 0
## 52 0 0 0 1 0 0
## 53 0 0 0 1 0 0
## 54 0 0 1 0 0 0
## 55 0 0 1 0 0 0
## 56 0 0 1 0 0 0
## 57 0 0 1 0 0 0
## 58 0 0 1 0 0 0
## 59 0 0 0 1 0 0
## 60 0 0 1 0 0 0
## 61 0 0 1 0 0 0
## 62 0 0 0 1 0 0
## 63 0 0 1 0 0 0
## 64 0 0 1 0 0 0
## 65 0 0 1 0 0 0
## 66 0 0 0 1 0 0
## 67 0 0 1 0 0 0
## 68 0 0 1 0 0 0
## 69 0 0 0 1 0 0
## 70 0 0 0 1 0 0
## 71 0 0 1 0 0 0
## 72 0 0 1 0 0 0
## 73 0 0 1 0 0 0
## 74 0 0 1 0 0 0
## 75 0 0 1 0 0 0
## 76 0 0 0 1 0 0
## 77 0 0 1 0 0 0
## 78 0 0 0 1 0 0
## 79 0 0 0 1 0 0
## 80 0 0 1 0 0 0
## 81 0 0 1 0 0 0
## 82 0 0 0 1 0 0
## 83 0 0 1 0 0 0
## 84 0 0 1 0 0 0
## 85 0 0 0 1 0 0
## 86 0 0 1 0 0 0
## 87 0 0 1 0 0 0
## 88 0 0 0 1 0 0
## 89 0 0 0 1 0 0
## 90 0 0 1 0 0 0
## 91 0 0 1 0 0 0
## 92 0 0 0 1 0 0
## 93 0 0 1 0 0 0
## 94 0 0 0 1 0 0
## 95 0 0 0 1 0 0
## 96 0 0 0 1 0 0
## 97 0 0 1 0 0 0
## 98 0 0 0 1 0 0
## 99 0 0 1 0 0 0
## 100 0 0 0 1 0 0
## 101 0 0 0 1 0 0
## 102 0 0 1 0 0 0
## 103 0 0 1 0 0 0
## 104 0 0 0 1 0 0
## 105 0 0 1 0 0 0
## 106 0 0 0 1 0 0
## 107 0 0 1 0 0 0
## 108 0 0 1 0 0 0
## 109 0 0 1 0 0 0
## 110 0 0 1 0 0 0
## 111 0 0 1 0 0 0
## 112 0 0 0 1 0 0
## 113 0 0 0 1 0 0
## 114 0 0 0 1 0 0
## 115 0 0 0 1 0 0
## 116 0 0 0 1 0 0
## 117 0 0 1 0 0 0
## 118 0 0 1 0 0 0
## 119 0 0 1 0 0 0
## 120 0 0 1 0 0 0
## 121 0 0 0 1 0 0
## 122 0 0 0 1 0 0
## 123 0 0 0 1 0 0
## 124 0 0 0 0 0 1
## 125 0 0 0 1 0 0
## 126 0 0 0 0 0 1
## 127 0 0 0 0 1 0
## 128 0 0 0 0 0 1
## 129 0 0 0 0 1 0
## 130 0 0 0 0 0 1
## 131 0 0 0 0 1 0
## 132 0 0 0 0 0 1
## 133 0 0 0 1 0 0
## 134 0 0 0 0 0 1
## 135 0 0 0 0 1 0
## 136 0 0 0 0 0 1
## 137 0 0 0 0 1 0
## 138 0 0 0 0 0 1
## 139 0 0 0 0 1 0
## 140 0 0 0 0 0 1
## 141 0 0 0 0 1 0
## 142 0 0 0 0 0 1
## 143 0 0 0 1 0 0
## 144 0 0 0 0 0 1
## 145 0 0 0 0 1 0
## 146 0 0 0 0 0 1
## 147 0 0 0 0 1 0
## 148 0 0 0 0 0 1
## 149 0 0 0 0 1 0
## 150 0 0 0 0 0 1
## 151 0 0 0 0 1 0
## 152 0 0 0 0 0 1
## 153 0 0 0 0 1 0
## 154 0 0 0 0 0 1
## 155 0 0 0 0 1 0
## 156 0 0 0 0 0 1
## 157 0 0 0 0 1 0
## 158 0 0 0 0 0 1
## 159 0 0 0 0 1 0
## 160 0 0 0 0 0 1
## 161 0 0 0 0 1 0
## 162 0 0 0 0 0 1
## 163 0 0 0 0 1 0
## 164 0 0 0 0 0 1
## 165 0 0 0 0 1 0
## 166 0 0 0 0 0 1
## 167 0 0 0 0 1 0
## 168 0 0 0 0 0 1
## 169 0 0 0 0 1 0
## 170 0 0 0 0 0 1
## 171 0 0 0 0 1 0
## 172 0 0 0 0 0 1
## 173 0 0 0 0 1 0
## 174 0 0 0 0 0 1
## 175 0 0 0 0 1 0
## 176 0 0 0 0 0 1
## 177 0 0 0 0 1 0
## 178 0 0 0 0 0 1
## 179 0 0 0 0 1 0
## 180 0 0 0 0 0 1
## 181 0 0 0 0 1 0
## 182 0 0 0 0 0 1
## 183 0 0 0 0 1 0
## 184 0 0 0 0 0 1
## 185 0 0 0 0 1 0
## 186 0 0 0 0 0 1
## 187 0 0 0 0 1 0
## 188 0 0 0 0 0 1
## 189 0 0 0 0 1 0
## 190 0 0 0 0 0 1
## 191 0 0 0 0 1 0
## 192 0 0 0 0 0 1
## 193 0 0 0 0 1 0
## 194 0 0 0 0 0 1
## 195 0 0 0 0 1 0
## 196 0 0 0 0 0 1
## 197 0 0 0 0 1 0
## 198 0 0 0 0 0 1
## 199 0 0 0 0 1 0
## 200 0 0 0 0 0 1