## Importing packages
library(ggplot2); theme_set(theme_bw())
## Warning: package 'ggplot2' was built under R version 3.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.5.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
customers <- read.csv("Mall_Customers.csv")
attach(customers)
## CHECK THE DIMENSIONS OF THE DATA
dim(customers)
## [1] 200 5
str(customers)
## 'data.frame': 200 obs. of 5 variables:
## $ CustomerID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
## $ Age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ Annual.Income..k.. : int 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending.Score..1.100.: int 39 81 6 77 40 76 6 94 3 72 ...
names(customers)
## [1] "CustomerID" "Gender"
## [3] "Age" "Annual.Income..k.."
## [5] "Spending.Score..1.100."
# Rename the columns for easier reference
names(customers) <- c("CustomerID", "Gender", "Age", "Annual_Income", "Spending_Score")
## UNIQUE CUSTOMERS
n_distinct(CustomerID)
## [1] 200
## GENDER DISTRIBUTION
as.data.frame(table(Gender)) %>%
  ggplot(aes(x = Gender, y = Freq)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  geom_text(y = as.vector(table(Gender)),
            label = paste0(as.vector(table(Gender)) / sum(table(Gender)) * 100, "%"))

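# A tidier way to build the same chart, as a sketch using dplyr's count():
gender_counts <- customers %>%
  count(Gender) %>%
  mutate(Pct = round(100 * n / sum(n), 1))
ggplot(gender_counts, aes(x = Gender, y = n)) +
  geom_col(fill = "#F8766D") +
  geom_text(aes(label = paste0(Pct, "%")), vjust = -0.5)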
## SUMMARY OF THE AGE VARIABLE
summary(Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 28.75 36.00 38.85 49.00 70.00
ggplot(as.data.frame(Age), aes(y = Age)) + geom_boxplot(fill='#F8766D')

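# A quick follow-up, as a sketch: the age distribution split by gender
# (the 5-year bin width is an arbitrary choice).
ggplot(customers, aes(x = Age, fill = Gender)) +
  geom_histogram(binwidth = 5, position = "dodge")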
# The first question that comes to mind is whether gender makes a difference in spending score
male_customers <- subset(customers, Gender == "Male")
female_customers <- subset(customers, Gender == "Female")
t.test(male_customers$Spending_Score, female_customers$Spending_Score, var.equal = FALSE)
# The Welch t-test indicates no statistically significant difference in spending score between male and female customers
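# A visual companion to the t-test, as a sketch: side-by-side boxplots of
# spending score by gender.
ggplot(customers, aes(x = Gender, y = Spending_Score, fill = Gender)) +
  geom_boxplot()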
ggplot(customers, aes(x = Age, y = Spending_Score, colour = Gender)) +
geom_point(na.rm = TRUE) +
geom_smooth(method="loess")

# From the plot, there appears to be a sharp decline in spending score after age 40.
# Although it gradually recovers after 50, it does not return to the levels seen in the 30s.
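# To quantify the decline, a sketch: mean spending score by 10-year age band
# (the band boundaries are an arbitrary choice).
customers %>%
  mutate(Age_Band = cut(Age, breaks = seq(10, 70, by = 10))) %>%
  group_by(Age_Band) %>%
  summarise(Mean_Spending = mean(Spending_Score), Customers = n())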
ggplot(customers, aes(x = Annual_Income, y = Spending_Score, colour = Gender)) +
geom_point(na.rm = TRUE) +
geom_smooth(method="loess")

# Although the mean spending score appears to hover around 50 across all income groups,
# the variances look drastically different.
earn_between_40k_50k <- subset(customers, Annual_Income >= 40 & Annual_Income <= 50)
other_income_range <- subset(customers, Annual_Income < 40 | Annual_Income > 50)
var.test(earn_between_40k_50k$Spending_Score, other_income_range$Spending_Score, alternative = "two.sided")
##
## F test to compare two variances
##
## data: earn_between_40k_50k$Spending_Score and other_income_range$Spending_Score
## F = 0.045139, num df = 27, denom df = 171, p-value = 1.439e-13
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.02678692 0.08610525
## sample estimates:
## ratio of variances
## 0.04513944
# The F-test above confirms that the variances are indeed very different between the two groups.
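# The variance pattern, visually, as a sketch: spending-score boxplots by income
# band (the 20k-wide bands are an arbitrary choice).
customers %>%
  mutate(Income_Band = cut(Annual_Income, breaks = seq(0, 140, by = 20))) %>%
  ggplot(aes(x = Income_Band, y = Spending_Score)) +
  geom_boxplot(fill = "#F8766D")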
############################### Cluster Analysis ###############################
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.5.2
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(cluster)
## Warning: package 'cluster' was built under R version 3.5.3
library(fpc)
library(NbClust)
## Warning: package 'NbClust' was built under R version 3.5.2
library(clValid)
## Warning: package 'clValid' was built under R version 3.5.2
library(magrittr)
## Warning: package 'magrittr' was built under R version 3.5.2
library(clustertend)
## Warning: package 'clustertend' was built under R version 3.5.2
# Drop CustomerID and Gender, keeping only the numeric variables
data <- customers[, -c(1, 2)]
View(data)
# Standardize the variables
data <- scale(data)
# Assessing cluster tendency
# Compute Hopkins statistic for the dataset
## H > 0.5 means the data are not clusterable
## H < 0.5 means the data are clusterable; the closer to 0, the better
set.seed(123)
hopkins(data, n = nrow(data)-1)
## $H
## [1] 0.3137406
# Since H = 0.31, well below the 0.5 threshold, the data are clusterable
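# A complementary visual check of cluster tendency, as a sketch: factoextra's
# ordered dissimilarity image (VAT); blocks along the diagonal suggest cluster structure.
fviz_dist(dist(data), show_labels = FALSE)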
# K-means: determining the optimal number of clusters
# The NbClust package offers 30 indices to determine the number of clusters in a dataset
# If index = "all", all 30 indices are run to determine the optimal number of clusters
# If index = "silhouette", the silhouette width is used: it measures how well each
# observation fits its own cluster relative to the nearest neighbouring cluster.
# A higher average silhouette width indicates a better number of clusters
## Method I: using the silhouette method
nb <- NbClust(data, distance = "euclidean", min.nc = 2, max.nc = 15,
              method = "kmeans", index = "silhouette")
nb$All.index  ## the maximum silhouette value indicates the best number of clusters
## 2 3 4 5 6 7 8 9 10 11
## 0.3355 0.3155 0.4040 0.4166 0.4274 0.4046 0.3977 0.3666 0.3591 0.3586
## 12 13 14 15
## 0.3624 0.3466 0.3730 0.3777
nb$Best.nc
## Number_clusters Value_Index
## 6.0000 0.4274
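# The same silhouette analysis can be visualized directly with factoextra, as a sketch:
fviz_nbclust(data, kmeans, method = "silhouette", k.max = 15)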
# Method II: the same silhouette-width analysis with the fpc package
# maximum value of silhouette shows best number of clusters
pamkClus <- pamk(data, krange = 2:15, criterion = "multiasw", ns = 2, critout = TRUE)
## 2 clusters 0.3230175
## 3 clusters 0.3584529
## 4 clusters 0.4002899
## 5 clusters 0.3602264
## 6 clusters 0.4193437
## 7 clusters 0.4123169
## 8 clusters 0.381269
## 9 clusters 0.3683303
## 10 clusters 0.3854899
## 11 clusters 0.3710397
## 12 clusters 0.364599
## 13 clusters 0.3665642
## 14 clusters 0.3649264
## 15 clusters 0.3357786
pamkClus$nc
## [1] 6
cat("number of clusters estimated by optimum average silhouette width:", pamkClus$nc, "\\\\n")
## number of clusters estimated by optimum average silhouette width: 6 \\n
# Method III: scree plot (elbow method) to determine the number of clusters
wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
for (i in 2:15) {
  wss[i] <- sum(kmeans(data, centers = i)$withinss)
}
plot(1:15, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")

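# Another alternative, as a sketch: the gap statistic from the cluster package,
# which compares within-cluster dispersion against a null reference distribution
# (B = 50 bootstrap samples is an arbitrary choice).
set.seed(123)
gap_stat <- clusGap(data, FUNcluster = kmeans, nstart = 25, K.max = 15, B = 50)
fviz_gap_stat(gap_stat)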
## Method IV (the most comprehensive): using all 30 indices
nb <- NbClust(data, distance = "euclidean", min.nc = 2, max.nc = 15,
              method = "kmeans", index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 6 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 4 proposed 6 as the best number of clusters
## * 2 proposed 8 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 2 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 4
##
##
## *******************************************************************
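# factoextra can also summarize the NbClust votes as a bar chart, as a sketch:
fviz_nbclust(nb)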
########################## Running k means #######################################
# K-Means Cluster Analysis
fit <- kmeans(data, 4)
plot(data, col = fit$cluster, pch = 16)
### kmeans clustering with a better visualization
# K-means clustering
# nstart = 25 runs k-means from multiple random initial configurations and reports the best one
km.res <- eclust(data, "kmeans", k = 4, nstart = 25, graph = FALSE)
# Visualize k-means clusters
fviz_cluster(km.res, geom = "point", ellipse.type = "norm")
## Another way of finding the number of clusters and running k-means
km_runs <- kmeansruns(data, 4, runs = 100)              ## for a single value of k
km_runs <- kmeansruns(data, krange = 1:12, runs = 100)  ## give a range and R finds the best k
plot(data, col = km_runs$cluster, pch = 16)
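# Inspecting the kmeansruns result, as a sketch: bestk is the chosen k and
# crit holds the criterion value for each k tried.
km_runs$bestk
km_runs$crit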

####################### Clustering Validation #################################################
# Internal clustering validation uses the internal information of the
# clustering process to evaluate the goodness of a clustering structure.
# It can also be used to estimate the number of clusters and to choose the appropriate clustering algorithm.
## Connectivity - ranges from 0 to Inf; choose the one with minimum value
## Average Silhouette width - ranges from -1 to 1; choose the one with maximum value
## Dunn index - ranges from 0 to Inf; choose the one with maximum value
# Internal Validation
clmethods <- c("kmeans")
internval <- clValid(data, nClust = 2:6, clMethods = clmethods, validation = "internal")
## Warning in clValid(data, nClust = 2:6, clMethods = clmethods, validation =
## "internal"): rownames for data not specified, using 1:nrow(data)
# Summary
summary(internval)
##
## Clustering Methods:
## kmeans
##
## Cluster sizes:
## 2 3 4 5 6
##
## Validation Measures:
## 2 3 4 5 6
##
## kmeans Connectivity 15.2317 29.3147 34.4337 33.1044 34.9714
## Dunn 0.0596 0.0445 0.0593 0.0659 0.0554
## Silhouette 0.3355 0.3503 0.4040 0.4166 0.4274
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 15.2317 kmeans 2
## Dunn 0.0659 kmeans 5
## Silhouette 0.4274 kmeans 6
optimalScores(internval)
## Score Method Clusters
## Connectivity 15.23174603 kmeans 2
## Dunn 0.06593646 kmeans 5
## Silhouette 0.42742815 kmeans 6
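# The same internal measures can be cross-checked with fpc, as a sketch:
# cluster.stats() returns, among others, the Dunn index and the average
# silhouette width for a given partition.
stats_k4 <- cluster.stats(d = dist(data), clustering = km.res$cluster)
stats_k4$dunn
stats_k4$avg.silwidth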
# Stability measure Validation
clmethods <- c("kmeans")
stabval <- clValid(data, nClust = 2:5, clMethods = clmethods, validation = "stability")
## Warning in clValid(data, nClust = 2:5, clMethods = clmethods, validation =
## "stability"): rownames for data not specified, using 1:nrow(data)
# Display only optimal Scores
summary(stabval)
##
## Clustering Methods:
## kmeans
##
## Cluster sizes:
## 2 3 4 5
##
## Validation Measures:
## 2 3 4 5
##
## kmeans APN 0.2547 0.2913 0.3127 0.3659
## AD 2.0424 1.8111 1.6195 1.5768
## ADM 0.7833 0.8359 0.8218 0.9040
## FOM 0.9959 0.9791 0.9484 0.9427
##
## Optimal Scores:
##
## Score Method Clusters
## APN 0.2547 kmeans 2
## AD 1.5768 kmeans 5
## ADM 0.7833 kmeans 2
## FOM 0.9427 kmeans 5
optimalScores(stabval)
## Score Method Clusters
## APN 0.2547106 kmeans 2
## AD 1.5767868 kmeans 5
## ADM 0.7832847 kmeans 2
## FOM 0.9426807 kmeans 5
#The average proportion of non-overlap (APN)
#The average distance (AD)
#The average distance between means (ADM)
#The figure of merit (FOM)
### Smaller values indicate stable clusters
## fviz_silhouette() requires the eclust() result computed above (km.res)
fviz_silhouette(km.res, palette = "jco", ggtheme = theme_classic())
## cluster size ave.sil.width
## 1 1 65 0.38
## 2 2 38 0.34
## 3 3 57 0.37
## 4 4 40 0.55

# Silhouette width of observations
sil <- km.res$silinfo$widths[, 1:3]
# Objects with negative silhouette
neg_sil_index <- which(sil[, 'sil_width'] < 0)
sil[neg_sil_index, , drop = FALSE]
## cluster neighbor sil_width
## 129 1 2 -0.03022948
##################################################################