## Importing packages
library(ggplot2); theme_set(theme_bw())
## Warning: package 'ggplot2' was built under R version 3.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.5.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
customers <- read.csv("Mall_Customers.csv")
attach(customers)
## CHECK THE DIMENSIONS OF THE DATA
dim(customers)
## [1] 200 5
str(customers)
## 'data.frame': 200 obs. of 5 variables:
## $ CustomerID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
## $ Age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ Annual.Income..k.. : int 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending.Score..1.100.: int 39 81 6 77 40 76 6 94 3 72 ...
names(customers)
## [1] "CustomerID" "Gender"
## [3] "Age" "Annual.Income..k.."
## [5] "Spending.Score..1.100."
# Rename the columns for easier reference
names(customers) <- c("CustomerID", "Gender", "Age", "Annual_Income", "Spending_Score")
## UNIQUE CUSTOMERS
n_distinct(CustomerID)
## [1] 200
## GENDER DISTRIBUTION
as.data.frame(table(Gender)) %>%
  ggplot(aes(x = Gender, y = Freq)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  geom_text(y = as.vector(table(Gender)),
            label = paste0(as.vector(table(Gender)) / sum(table(Gender)) * 100, "%"))

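# A tidier way to build the same chart, as a sketch using dplyr's count():
gender_counts <- customers %>%
  count(Gender) %>%
  mutate(Pct = round(100 * n / sum(n), 1))
ggplot(gender_counts, aes(x = Gender, y = n)) +
  geom_col(fill = "#F8766D") +
  geom_text(aes(label = paste0(Pct, "%")), vjust = -0.5)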
## SUMMARY OF THE AGE VARIABLE
summary(Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 28.75 36.00 38.85 49.00 70.00
ggplot(as.data.frame(Age), aes(y = Age)) + geom_boxplot(fill='#F8766D')

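# A quick follow-up, as a sketch: the age distribution split by gender
# (the 5-year bin width is an arbitrary choice).
ggplot(customers, aes(x = Age, fill = Gender)) +
  geom_histogram(binwidth = 5, position = "dodge")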
# The first question that comes to mind is whether gender makes a difference in spending score
male_customers <- subset(customers, Gender == "Male")
female_customers <- subset(customers, Gender == "Female")
t.test(male_customers$Spending_Score, female_customers$Spending_Score, var.equal = FALSE)
# The Welch t-test indicates no statistically significant difference in spending score between male and female customers
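# A visual companion to the t-test, as a sketch: side-by-side boxplots of
# spending score by gender.
ggplot(customers, aes(x = Gender, y = Spending_Score, fill = Gender)) +
  geom_boxplot()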
ggplot(customers, aes(x = Age, y = Spending_Score, colour = Gender)) +
geom_point(na.rm = TRUE) +
geom_smooth(method="loess")

# From the plot, there appears to be a sharp decline in spending score after age 40.
# Although it gradually recovers after 50, it does not return to the levels seen in the 30s.
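# To quantify the decline, a sketch: mean spending score by 10-year age band
# (the band boundaries are an arbitrary choice).
customers %>%
  mutate(Age_Band = cut(Age, breaks = seq(10, 70, by = 10))) %>%
  group_by(Age_Band) %>%
  summarise(Mean_Spending = mean(Spending_Score), Customers = n())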
ggplot(customers, aes(x = Annual_Income, y = Spending_Score, colour = Gender)) +
geom_point(na.rm = TRUE) +
geom_smooth(method="loess")

# Although the mean spending score appears to hover around 50 across all income groups,
# the variances look drastically different.
earn_between_40k_50k <- subset(customers, Annual_Income >= 40 & Annual_Income <= 50)
other_income_range <- subset(customers, Annual_Income < 40 | Annual_Income > 50)
var.test(earn_between_40k_50k$Spending_Score, other_income_range$Spending_Score, alternative = "two.sided")
##
## F test to compare two variances
##
## data: earn_between_40k_50k$Spending_Score and other_income_range$Spending_Score
## F = 0.045139, num df = 27, denom df = 171, p-value = 1.439e-13
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.02678692 0.08610525
## sample estimates:
## ratio of variances
## 0.04513944
# The F-test above confirms that the variances are indeed very different between the two groups.
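# The variance pattern, visually, as a sketch: spending-score boxplots by income
# band (the 20k-wide bands are an arbitrary choice).
customers %>%
  mutate(Income_Band = cut(Annual_Income, breaks = seq(0, 140, by = 20))) %>%
  ggplot(aes(x = Income_Band, y = Spending_Score)) +
  geom_boxplot(fill = "#F8766D")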
############################### Cluster Analysis ###############################
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.5.2
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(cluster)
## Warning: package 'cluster' was built under R version 3.5.3
library(fpc)
library(NbClust)
## Warning: package 'NbClust' was built under R version 3.5.2
library(clValid)
## Warning: package 'clValid' was built under R version 3.5.2
library(magrittr)
## Warning: package 'magrittr' was built under R version 3.5.2
library(clustertend)
## Warning: package 'clustertend' was built under R version 3.5.2
# Drop CustomerID and Gender, keeping only the numeric variables
data <- customers[, -c(1, 2)]
View(data)
# Standardize the variables
data <- scale(data)
# Assessing cluster tendency
# Compute Hopkins statistic for the dataset
## H > 0.5 means the data are not clusterable
## H < 0.5 means the data are clusterable; the closer to 0, the better
set.seed(123)
hopkins(data, n = nrow(data)-1)
## $H
## [1] 0.3137406
# Since H = 0.31, well below the 0.5 threshold, the data are clusterable
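# A complementary visual check of cluster tendency, as a sketch: factoextra's
# ordered dissimilarity image (VAT); blocks along the diagonal suggest cluster structure.
fviz_dist(dist(data), show_labels = FALSE)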
# K-means: determining the optimal number of clusters
# The NbClust package offers 30 indices to determine the number of clusters in a dataset
# If index = "all", all 30 indices are run to determine the optimal number of clusters
# If index = "silhouette", the silhouette width is used: it measures how well each
# observation fits its own cluster relative to the nearest neighbouring cluster.
# A higher average silhouette width indicates a better number of clusters
## Method I: using the silhouette method
nb <- NbClust(data, distance = "euclidean", min.nc = 2, max.nc = 15,
              method = "kmeans", index = "silhouette")
nb$All.index  ## the maximum silhouette value indicates the best number of clusters
## 2 3 4 5 6 7 8 9 10 11
## 0.3355 0.3155 0.4040 0.4166 0.4274 0.4046 0.3977 0.3666 0.3591 0.3586
## 12 13 14 15
## 0.3624 0.3466 0.3730 0.3777
nb$Best.nc
## Number_clusters Value_Index
## 6.0000 0.4274
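# The same silhouette analysis can be visualized directly with factoextra, as a sketch:
fviz_nbclust(data, kmeans, method = "silhouette", k.max = 15)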
# Method II: the same silhouette-width analysis with the fpc package
# maximum value of silhouette shows best number of clusters
pamkClus <- pamk(data, krange = 2:15, criterion = "multiasw", ns = 2, critout = TRUE)
## 2 clusters 0.3230175
## 3 clusters 0.3584529
## 4 clusters 0.4002899
## 5 clusters 0.3602264
## 6 clusters 0.4193437
## 7 clusters 0.4123169
## 8 clusters 0.381269
## 9 clusters 0.3683303
## 10 clusters 0.3854899
## 11 clusters 0.3710397
## 12 clusters 0.364599
## 13 clusters 0.3665642
## 14 clusters 0.3649264
## 15 clusters 0.3357786
pamkClus$nc
## [1] 6
cat("number of clusters estimated by optimum average silhouette width:", pamkClus$nc, "\\\\n")
## number of clusters estimated by optimum average silhouette width: 6 \\n
# Method III: scree plot (elbow method) to determine the number of clusters
wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
for (i in 2:15) {
  wss[i] <- sum(kmeans(data, centers = i)$withinss)
}
plot(1:15, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")

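# Another alternative, as a sketch: the gap statistic from the cluster package,
# which compares within-cluster dispersion against a null reference distribution
# (B = 50 bootstrap samples is an arbitrary choice).
set.seed(123)
gap_stat <- clusGap(data, FUNcluster = kmeans, nstart = 25, K.max = 15, B = 50)
fviz_gap_stat(gap_stat)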
## Method IV (the most comprehensive): using all 30 indices
nb <- NbClust(data, distance = "euclidean", min.nc = 2, max.nc = 15,
              method = "kmeans", index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 6 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 4 proposed 6 as the best number of clusters
## * 2 proposed 8 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 2 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 4
##
##
## *******************************************************************
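# factoextra can also summarize the NbClust votes as a bar chart, as a sketch:
fviz_nbclust(nb)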
########################## Running k means #######################################
# K-Means Cluster Analysis
fit <- kmeans(data, 4)
plot(data, col = fit$cluster, pch = 16)
### kmeans clustering with a better visualization
# K-means clustering
# nstart = 25 runs k-means from multiple random initial configurations and reports the best one
km.res <- eclust(data, "kmeans", k = 4, nstart = 25, graph = FALSE)
# Visualize k-means clusters
fviz_cluster(km.res, geom = "point", ellipse.type = "norm")
## Another way of finding the number of clusters and running k-means
km_runs <- kmeansruns(data, 4, runs = 100)              ## for a single value of k
km_runs <- kmeansruns(data, krange = 1:12, runs = 100)  ## give a range and R finds the best k
plot(data, col = km_runs$cluster, pch = 16)
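# Inspecting the kmeansruns result, as a sketch: bestk is the chosen k and
# crit holds the criterion value for each k tried.
km_runs$bestk
km_runs$crit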

####################### Clustering Validation #################################################
# Internal clustering validation uses the internal information of the
# clustering process to evaluate the goodness of a clustering structure.
# It can also be used to estimate the number of clusters and to choose the appropriate clustering algorithm.
## Connectivity - ranges from 0 to Inf; choose the one with minimum value
## Average Silhouette width - ranges from -1 to 1; choose the one with maximum value
## Dunn index - ranges from 0 to Inf; choose the one with maximum value
# Internal Validation
clmethods <- c("kmeans")
internval <- clValid(data, nClust = 2:6, clMethods = clmethods, validation = "internal")
## Warning in clValid(data, nClust = 2:6, clMethods = clmethods, validation =
## "internal"): rownames for data not specified, using 1:nrow(data)
# Summary
summary(internval)
##
## Clustering Methods:
## kmeans
##
## Cluster sizes:
## 2 3 4 5 6
##
## Validation Measures:
## 2 3 4 5 6
##
## kmeans Connectivity 15.2317 29.3147 34.4337 33.1044 34.9714
## Dunn 0.0596 0.0445 0.0593 0.0659 0.0554
## Silhouette 0.3355 0.3503 0.4040 0.4166 0.4274
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 15.2317 kmeans 2
## Dunn 0.0659 kmeans 5
## Silhouette 0.4274 kmeans 6
optimalScores(internval)
## Score Method Clusters
## Connectivity 15.23174603 kmeans 2
## Dunn 0.06593646 kmeans 5
## Silhouette 0.42742815 kmeans 6
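# The same internal measures can be cross-checked with fpc, as a sketch:
# cluster.stats() returns, among others, the Dunn index and the average
# silhouette width for a given partition.
stats_k4 <- cluster.stats(d = dist(data), clustering = km.res$cluster)
stats_k4$dunn
stats_k4$avg.silwidth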
# Stability measure Validation
clmethods <- c("kmeans")
stabval <- clValid(data, nClust = 2:5, clMethods = clmethods, validation = "stability")
## Warning in clValid(data, nClust = 2:5, clMethods = clmethods, validation =
## "stability"): rownames for data not specified, using 1:nrow(data)
# Display only optimal Scores
summary(stabval)
##
## Clustering Methods:
## kmeans
##
## Cluster sizes:
## 2 3 4 5
##
## Validation Measures:
## 2 3 4 5
##
## kmeans APN 0.2547 0.2913 0.3127 0.3659
## AD 2.0424 1.8111 1.6195 1.5768
## ADM 0.7833 0.8359 0.8218 0.9040
## FOM 0.9959 0.9791 0.9484 0.9427
##
## Optimal Scores:
##
## Score Method Clusters
## APN 0.2547 kmeans 2
## AD 1.5768 kmeans 5
## ADM 0.7833 kmeans 2
## FOM 0.9427 kmeans 5
optimalScores(stabval)
## Score Method Clusters
## APN 0.2547106 kmeans 2
## AD 1.5767868 kmeans 5
## ADM 0.7832847 kmeans 2
## FOM 0.9426807 kmeans 5
#The average proportion of non-overlap (APN)
#The average distance (AD)
#The average distance between means (ADM)
#The figure of merit (FOM)
### Smaller values indicate stable clusters
## fviz_silhouette() requires the eclust() result computed above (km.res)
fviz_silhouette(km.res, palette = "jco", ggtheme = theme_classic())
## cluster size ave.sil.width
## 1 1 65 0.38
## 2 2 38 0.34
## 3 3 57 0.37
## 4 4 40 0.55

# Silhouette width of observations
sil <- km.res$silinfo$widths[, 1:3]
# Objects with negative silhouette
neg_sil_index <- which(sil[, 'sil_width'] < 0)
sil[neg_sil_index, , drop = FALSE]
## cluster neighbor sil_width
## 129 1 2 -0.03022948
##################################################################