library(readr)
# Location of the Mall Customers CSV file. Set the MALL_CUSTOMERS_CSV
# environment variable to point the script at another copy of the data;
# when it is unset, the original hard-coded path is used unchanged.
mall_customers_path <- Sys.getenv(
  "MALL_CUSTOMERS_CSV",
  unset = "C:/Users/ACER/Downloads/Mall_Customers.csv"
)
Mall_Customers <- read_csv(mall_customers_path)
## Rows: 400 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Gender
## dbl (4): CustomerID, Age, Annual Income (k$), Spending Score (1-100)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a spreadsheet-style data viewer, which is only useful (and
# only reliably available) in an interactive session, so skip it otherwise.
if (interactive()) {
  View(Mall_Customers)
}
# Working copy used by every later step of the analysis.
# NOTE(review): the file is reported as 400 rows and the printed clustering
# vector further down repeats itself from row 201 onwards -- the CSV may hold
# every customer twice. Confirm against the raw data before trusting counts.
customer_data <- Mall_Customers
# Customer Gender Visualization
# Frequency table of the Gender column; reused below for the pie chart.
a <- table(customer_data$Gender)
barplot(
  a,
  main = "Using BarPlot to display Gender Comparision",
  ylab = "Count",
  xlab = "Gender",
  # One colour per category actually present, instead of assuming exactly two.
  col = rainbow(length(a)),
  # Full argument name; `legend =` only worked through partial matching.
  legend.text = rownames(a)
)

# Share of each gender as a whole-number percentage.
pct <- round(a / sum(a) * 100)
# Take the label text from the table itself so every label is guaranteed to
# sit on the slice it describes, whatever the category names or their order.
lbs <- paste0(names(a), " ", pct, "%")
library(plotrix)
pie3D(
  a,
  labels = lbs,
  main = "Pie Chart Depicting Ratio of Female and Male"
)

# Visualization of Age Distribution
# Summary statistics of customer age (output of the original run kept below).
summary(customer_data[["Age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 28.75 36.00 38.85 49.00 70.00

# Histogram of age with the bar counts printed above the bars.
hist(
  customer_data[["Age"]],
  labels = TRUE,
  main = "Histogram to Show Count of Age Class",
  xlab = "Age Class",
  ylab = "Frequency",
  col = "blue"
)

# Boxplot of the same variable.
boxplot(
  customer_data[["Age"]],
  main = "Boxplot for Descriptive Analysis of Age",
  col = "#ff0066"
)

# Analysis of the Annual Income of the Customers
# Summary statistics of annual income (output of the original run kept below).
summary(customer_data[["Annual Income (k$)"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.00 41.50 61.50 60.56 78.00 137.00

# Histogram of annual income with the bar counts printed above the bars.
hist(
  customer_data[["Annual Income (k$)"]],
  labels = TRUE,
  main = "Histogram for Annual Income",
  xlab = "Annual Income Class",
  ylab = "Frequency",
  col = "#660033"
)

# Kernel density estimate of annual income: draw the curve, then fill it.
# local() keeps the temporary density object out of the global environment.
local({
  income_density <- density(customer_data[["Annual Income (k$)"]])
  plot(
    income_density,
    main = "Density Plot for Annual Income",
    xlab = "Annual Income Class",
    ylab = "Density",
    col = "yellow"
  )
  polygon(income_density, col = "#ccff66")
})

# Horizontal boxplot of the spending score.
boxplot(
  customer_data[["Spending Score (1-100)"]],
  main = "BoxPlot for Descriptive Analysis of Spending Score",
  col = "#990000",
  horizontal = TRUE
)

# Histogram of the spending score with the bar counts printed above the bars.
hist(
  customer_data[["Spending Score (1-100)"]],
  labels = TRUE,
  col = "#6600cc",
  main = "HistoGram for Spending Score",
  xlab = "Spending Score Class",
  ylab = "Frequency"
)

#K-means Algorithm
library(purrr)
set.seed(123)
# Total intra-cluster (within-cluster) sum of squares of a k-means fit.
#
# Arguments:
#   k    - number of clusters to fit.
#   data - numeric data to cluster. Defaults to columns 3 to 5 of
#          `customer_data`; the default is evaluated lazily, so it is only
#          looked up when `data` is not supplied.
# Returns: the `tot.withinss` component of the fitted kmeans object.
#
# Each fit uses Lloyd's algorithm with up to 100 iterations and keeps the
# best of 100 random starts.
iss <- function(k, data = customer_data[, 3:5]) {
  fit <- kmeans(
    data,
    centers = k,
    iter.max = 100,
    nstart = 100,
    algorithm = "Lloyd"
  )
  fit$tot.withinss
}
# Candidate cluster counts and the matching total within-cluster sum of
# squares, one value per k.
k.values <- seq_len(10)
iss_values <- map_dbl(.x = k.values, .f = iss)

# Elbow curve: criterion value against the number of clusters.
plot(
  x = k.values,
  y = iss_values,
  type = "b",
  pch = 19,
  frame = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total intra-clusters sum of squares"
)
library(cluster)
library(gridExtra)

library(grid)
library(ggplot2)
library(NbClust)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Silhouette-based diagnostic for the number of clusters.
fviz_nbclust(
  x = customer_data[, 3:5],
  FUNcluster = kmeans,
  method = "silhouette"
)

# Gap statistic for 1 to 10 clusters using 50 bootstrap samples; `nstart` is
# forwarded to kmeans. Seeded so the reference samples are reproducible.
set.seed(125)
stat_gap <- clusGap(
  x = customer_data[, 3:5],
  FUNcluster = kmeans,
  K.max = 10,
  B = 50,
  nstart = 25
)
fviz_gap_stat(stat_gap)

# Final model: six clusters fitted with Lloyd's algorithm, best of 50 random
# starts, at most 100 iterations each. Printing the object shows the summary.
k6 <- kmeans(
  x = customer_data[, 3:5],
  centers = 6,
  iter.max = 100,
  nstart = 50,
  algorithm = "Lloyd"
)
k6
## K-means clustering with 6 clusters of sizes 90, 76, 44, 78, 70, 42
##
## Cluster means:
## Age Annual Income (k$) Spending Score (1-100)
## 1 56.15556 53.37778 49.08889
## 2 27.00000 56.65789 49.13158
## 3 25.27273 25.72727 79.36364
## 4 32.69231 86.53846 82.12821
## 5 41.68571 88.22857 17.28571
## 6 44.14286 25.14286 19.52381
##
## Clustering vector:
## [1] 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6
## [38] 3 6 3 1 3 1 2 6 3 1 2 2 2 1 2 2 1 1 1 1 1 2 1 1 2 1 1 1 2 1 1 2 2 1 1 1 1
## [75] 1 2 1 2 2 1 1 2 1 1 2 1 1 2 2 1 1 2 1 2 2 2 1 2 1 2 2 1 1 2 1 2 1 1 1 1 1
## [112] 2 2 2 2 2 1 1 1 1 2 2 2 4 2 4 5 4 5 4 5 4 2 4 5 4 5 4 5 4 5 4 2 4 5 4 5 4
## [149] 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5
## [186] 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3
## [223] 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 1 3 1 2 6 3 1 2 2 2 1 2 2 1 1 1 1 1 2
## [260] 1 1 2 1 1 1 2 1 1 2 2 1 1 1 1 1 2 1 2 2 1 1 2 1 1 2 1 1 2 2 1 1 2 1 2 2 2
## [297] 1 2 1 2 2 1 1 2 1 2 1 1 1 1 1 2 2 2 2 2 1 1 1 1 2 2 2 4 2 4 5 4 5 4 5 4 2
## [334] 4 5 4 5 4 5 4 5 4 2 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
## [371] 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
##
## Within cluster sum of squares by cluster:
## [1] 16124.267 15485.789 8199.636 27944.718 33381.714 15464.762
## (between_SS / total_SS = 81.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Visualizing the Clustering Results using the First Two Principal Components
# Principal component analysis of the three clustering variables, without
# rescaling them to unit variance.
pcclust <- prcomp(customer_data[, 3:5], scale. = FALSE)
summary(pcclust)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 26.4293 26.1269 12.9155
## Proportion of Variance 0.4512 0.4410 0.1078
## Cumulative Proportion 0.4512 0.8922 1.0000
# Loadings of the original variables on the first two components.
pcclust[["rotation"]][, 1:2]
## PC1 PC2
## Age 0.1889742 -0.1309652
## Annual Income (k$) -0.5886410 -0.8083757
## Spending Score (1-100) -0.7859965 0.5739136
set.seed(1)
# Annual income against spending score, coloured by k-means cluster.
ggplot(
  data = customer_data,
  mapping = aes(x = `Annual Income (k$)`, y = `Spending Score (1-100)`)
) +
  geom_point(
    mapping = aes(color = as.factor(k6$cluster)),
    stat = "identity"
  ) +
  scale_color_discrete(
    name = " ",
    breaks = c("1", "2", "3", "4", "5", "6"),
    labels = c(
      "Cluster 1", "Cluster 2", "Cluster 3",
      "Cluster 4", "Cluster 5", "Cluster 6"
    )
  ) +
  labs(
    title = "Segments of Mall Customers",
    subtitle = "Using K-means Clustering"
  )

# Spending score against age, coloured by k-means cluster.
ggplot(
  data = customer_data,
  mapping = aes(x = `Spending Score (1-100)`, y = Age)
) +
  geom_point(
    mapping = aes(color = as.factor(k6$cluster)),
    stat = "identity"
  ) +
  scale_color_discrete(
    name = " ",
    breaks = c("1", "2", "3", "4", "5", "6"),
    labels = c(
      "Cluster 1", "Cluster 2", "Cluster 3",
      "Cluster 4", "Cluster 5", "Cluster 6"
    )
  ) +
  labs(
    title = "Segments of Mall Customers",
    subtitle = "Using K-means Clustering"
  )

# Map each element of `vec` to a rainbow colour, one colour per distinct
# value. Colours are assigned in the order of the factor levels of `vec`.
# Returns a character vector of colours the same length as `vec`.
kCols <- function(vec) {
  palette_cols <- rainbow(length(unique(vec)))
  level_index <- as.integer(as.factor(vec))
  palette_cols[level_index]
}
# K-means cluster id of every row, and the same ids as legend text.
digCluster <- k6$cluster
dignm <- as.character(digCluster)

# Colour of every observation, computed once and reused for the legend so
# the legend always shows exactly the colours that were plotted.
point_cols <- kCols(digCluster)

# Scatter of the observations on the first two principal components. The axes
# are labelled as such; the previous labels ("K-means"/"classes") did not
# describe what is plotted.
plot(
  pcclust$x[, 1:2],
  col = point_cols,
  pch = 19,
  xlab = "PC1",
  ylab = "PC2"
)

# One legend entry per cluster, sorted by cluster id rather than by the order
# in which the clusters happen to appear in the data.
first_seen <- !duplicated(digCluster)
legend_order <- order(digCluster[first_seen])
legend(
  "bottomleft",
  legend = dignm[first_seen][legend_order],
  fill = point_cols[first_seen][legend_order]
)
