library(readr)
# Load the mall customer records from a CSV file.
# NOTE(review): this is a machine-specific absolute path; the script only runs
# where this exact file exists.
Mall_Customers <- read_csv(
  file = "C:/Users/ACER/Downloads/Mall_Customers.csv"
)
## Rows: 400 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Gender
## dbl (4): CustomerID, Age, Annual Income (k$), Spending Score (1-100)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only in interactive sessions: View() needs
# a GUI/display, so calling it unconditionally can fail in batch runs.
if (interactive()) {
  View(Mall_Customers)
}
# The rest of the script works on `customer_data`.
customer_data <- Mall_Customers
#Customer Gender Visualization

# Frequency table of the Gender column (one count per distinct value).
a <- table(customer_data$Gender)
barplot(a,
        main = "Using BarPlot to display Gender Comparision",
        ylab = "Count",
        xlab = "Gender",
        col = rainbow(2),
        legend.text = names(a))

# Share of each gender as a whole-number percentage.
pct <- round(a / sum(a) * 100)
# Build the labels from the table's own names so every label is guaranteed to
# sit on the slice it describes, whatever values or ordering the data has.
lbs <- paste0(names(a), " ", pct, "%")
library(plotrix)
pie3D(a,
      labels = lbs,
      main = "Pie Chart Depicting Ratio of Female and Male")

#Visualization of Age Distribution


# Summary statistics of the Age column.
summary(customer_data[["Age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   28.75   36.00   38.85   49.00   70.00
# Histogram of ages; labels = TRUE prints the count on top of every bar.
hist(
  customer_data[["Age"]],
  main = "Histogram to Show Count of Age Class",
  xlab = "Age Class",
  ylab = "Frequency",
  col = "blue",
  labels = TRUE
)

# Boxplot of the same column.
boxplot(
  customer_data[["Age"]],
  main = "Boxplot for Descriptive Analysis of Age",
  col = "#ff0066"
)

#Analysis of the Annual Income of the Customers

# Summary statistics of the annual income column.
summary(customer_data$`Annual Income (k$)`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   41.50   61.50   60.56   78.00  137.00
# Histogram of annual income; labels = TRUE prints the count on every bar.
hist(customer_data$`Annual Income (k$)`,
     col = "#660033",
     main = "Histogram for Annual Income",
     xlab = "Annual Income Class",
     ylab = "Frequency",
     labels = TRUE)

# Estimate the density once and reuse it, so the outline and the filled area
# are drawn from the same estimate and the work is not done twice.
income_density <- density(customer_data$`Annual Income (k$)`)
plot(income_density,
     col = "yellow",
     main = "Density Plot for Annual Income",
     xlab = "Annual Income Class",
     ylab = "Density")
# Fill the area under the curve drawn above.
polygon(income_density,
        col = "#ccff66")

# Horizontal boxplot of the spending score column.
boxplot(
  customer_data[["Spending Score (1-100)"]],
  main = "BoxPlot for Descriptive Analysis of Spending Score",
  col = "#990000",
  horizontal = TRUE
)

# Histogram of the spending score; labels = TRUE prints the count per bar.
hist(
  customer_data[["Spending Score (1-100)"]],
  col = "#6600cc",
  main = "HistoGram for Spending Score",
  xlab = "Spending Score Class",
  ylab = "Frequency",
  labels = TRUE
)

#K-means Algorithm
library(purrr)
set.seed(123)
# Total intra-cluster (within-cluster) sum of squares of a k-means fit.
#
# k:    number of clusters to fit.
# data: numeric table to cluster. Defaults to columns 3:5 of the global
#       `customer_data`, so existing one-argument calls behave as before; pass
#       another table to reuse the function on different data.
#
# Returns the `tot.withinss` component of the kmeans() result, fitted with
# Lloyd's algorithm, up to 100 iterations and 100 random starts.
iss <- function(k, data = customer_data[, 3:5]) {
  kmeans(data, k,
         iter.max = 100,
         nstart = 100,
         algorithm = "Lloyd")$tot.withinss
}

# Candidate numbers of clusters to evaluate.
k.values <- seq_len(10)

# One total within-cluster sum of squares per candidate k.
iss_values <- vapply(k.values, iss, numeric(1))

# Within-cluster sum of squares against k, as points joined by lines.
plot(
  x = k.values,
  y = iss_values,
  xlab = "Number of clusters K",
  ylab = "Total intra-clusters sum of squares",
  type = "b",
  pch = 19,
  frame = FALSE
)


library(cluster) 
library(gridExtra)

library(grid)
library(ggplot2)
library(NbClust)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Silhouette-method plot for choosing the number of k-means clusters.
fviz_nbclust(customer_data[, 3:5], FUNcluster = kmeans, method = "silhouette")

set.seed(125)
# Gap statistic for k-means with up to K.max = 10 clusters and B = 50; nstart
# is forwarded to kmeans(). The clustering function is passed under its full
# argument name (FUNcluster) rather than relying on partial matching of `FUN`.
stat_gap <- clusGap(customer_data[, 3:5],
                    FUNcluster = kmeans,
                    nstart = 25,
                    K.max = 10,
                    B = 50)
fviz_gap_stat(stat_gap)

# Fit k-means with 6 clusters on columns 3:5 (Lloyd's algorithm, 50 random
# starts, at most 100 iterations each).
k6 <- kmeans(
  x = customer_data[, 3:5],
  centers = 6,
  iter.max = 100,
  nstart = 50,
  algorithm = "Lloyd"
)
# Show the fitted model (sizes, centers, assignments, sums of squares).
k6
## K-means clustering with 6 clusters of sizes 90, 76, 44, 78, 70, 42
## 
## Cluster means:
##        Age Annual Income (k$) Spending Score (1-100)
## 1 56.15556           53.37778               49.08889
## 2 27.00000           56.65789               49.13158
## 3 25.27273           25.72727               79.36364
## 4 32.69231           86.53846               82.12821
## 5 41.68571           88.22857               17.28571
## 6 44.14286           25.14286               19.52381
## 
## Clustering vector:
##   [1] 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6
##  [38] 3 6 3 1 3 1 2 6 3 1 2 2 2 1 2 2 1 1 1 1 1 2 1 1 2 1 1 1 2 1 1 2 2 1 1 1 1
##  [75] 1 2 1 2 2 1 1 2 1 1 2 1 1 2 2 1 1 2 1 2 2 2 1 2 1 2 2 1 1 2 1 2 1 1 1 1 1
## [112] 2 2 2 2 2 1 1 1 1 2 2 2 4 2 4 5 4 5 4 5 4 2 4 5 4 5 4 5 4 5 4 2 4 5 4 5 4
## [149] 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5
## [186] 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3
## [223] 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 1 3 1 2 6 3 1 2 2 2 1 2 2 1 1 1 1 1 2
## [260] 1 1 2 1 1 1 2 1 1 2 2 1 1 1 1 1 2 1 2 2 1 1 2 1 1 2 1 1 2 2 1 1 2 1 2 2 2
## [297] 1 2 1 2 2 1 1 2 1 2 1 1 1 1 1 2 2 2 2 2 1 1 1 1 2 2 2 4 2 4 5 4 5 4 5 4 2
## [334] 4 5 4 5 4 5 4 5 4 2 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
## [371] 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
## 
## Within cluster sum of squares by cluster:
## [1] 16124.267 15485.789  8199.636 27944.718 33381.714 15464.762
##  (between_SS / total_SS =  81.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Visualizing the Clustering Results using the First Two Principal Components

# Principal component analysis on columns 3:5 without rescaling the variables.
# The argument is spelled `scale.` (its real name in prcomp) instead of relying
# on partial matching of `scale`.
pcclust <- prcomp(customer_data[, 3:5], scale. = FALSE)
summary(pcclust)
## Importance of components:
##                            PC1     PC2     PC3
## Standard deviation     26.4293 26.1269 12.9155
## Proportion of Variance  0.4512  0.4410  0.1078
## Cumulative Proportion   0.4512  0.8922  1.0000
# Loadings of the original variables on the first two components.
pcclust$rotation[, 1:2]
##                               PC1        PC2
## Age                     0.1889742 -0.1309652
## Annual Income (k$)     -0.5886410 -0.8083757
## Spending Score (1-100) -0.7859965  0.5739136
set.seed(1)
# Components shared by both segment plots, defined once so the two figures
# cannot drift apart: points coloured by k-means cluster, a legend with
# readable cluster names, and the common title.
segment_layers <- list(
  geom_point(stat = "identity", aes(color = as.factor(k6$cluster))),
  scale_color_discrete(
    name = " ",
    breaks = c("1", "2", "3", "4", "5", "6"),
    labels = c("Cluster 1", "Cluster 2", "Cluster 3",
               "Cluster 4", "Cluster 5", "Cluster 6")
  ),
  ggtitle("Segments of Mall Customers", subtitle = "Using K-means Clustering")
)

# Segments in the income / spending-score plane.
ggplot(customer_data,
       aes(x = `Annual Income (k$)`, y = `Spending Score (1-100)`)) +
  segment_layers

# Segments in the spending-score / age plane.
ggplot(customer_data, aes(x = `Spending Score (1-100)`, y = Age)) +
  segment_layers

       # Map every element of `vec` to a rainbow colour, one colour per distinct
       # value; equal values always get the same colour.
       kCols <- function(vec) {
         cluster_palette <- rainbow(length(unique(vec)))
         level_index <- as.integer(as.factor(vec))
         cluster_palette[level_index]
       }
       
       # K-means cluster assignment of every customer. `dignm` (its character
       # form) is kept for any later code that may read it.
       digCluster <- k6$cluster
       dignm <- as.character(digCluster)

       # Customers on the first two principal components, coloured by cluster.
       # The axes carry PC scores, so they are labelled PC1/PC2; the former
       # axis texts ("K-means", "classes") are kept as the plot title.
       plot(pcclust$x[, 1:2],
            col = kCols(digCluster),
            pch = 19,
            main = "K-means classes",
            xlab = "PC1",
            ylab = "PC2")

       # Legend in ascending cluster order. kCols() assigns colours by sorted
       # value, so calling it on the sorted ids gives each cluster the same
       # colour it has in the scatter.
       cluster_ids <- sort(unique(digCluster))
       legend("bottomleft",
              legend = as.character(cluster_ids),
              fill = kCols(cluster_ids))