Integrate unsupervised and supervised learning techniques.

Part 1 - Running a k-Means analysis

#Collecting Data
data <- read.csv("Wholesale_customers_data.csv")
str(data)
## 'data.frame':    440 obs. of  8 variables:
##  $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
##  $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
##  $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
##  $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
##  $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
##  $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
summary(data)
##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0

A huge difference for the top customers in every category for example - Fresh and Frozen columns.

Next step is to do clustering and segmentation for middle 50%.

#Convert to data frame
top.n.custs <- function (data,cols,n=5) {
 ##Initialize a vector to hold customers being removed
 idx.to.remove <-integer(0)

 for (c in cols){


 col.order <-order(data[,c],decreasing=T)



 idx <-head(col.order, n)
 idx.to.remove <-union(idx.to.remove,idx)

 }
 return(idx.to.remove)

}
# Number of Customers Removed
top.custs <-top.n.custs(data, cols = 1:5,n=5)
length(top.custs) 
## [1] 18
#Customers details
data[top.custs,] 
##     Channel Region  Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
## 1         2      3  12669  9656    7561    214             2674       1338
## 2         2      3   7057  9810    9568   1762             3293       1776
## 3         2      3   6353  8808    7684   2405             3516       7844
## 5         2      3  22615  5410    7198   3915             1777       5185
## 6         2      3   9413  8259    5126    666             1795       1451
## 4         1      3  13265  1196    4221   6404              507       1788
## 182       1      3 112151 29627   18148  16745             4948       8550
## 126       1      3  76237  3473    7102  16538              778        918
## 285       1      3  68951  4411   12609   8692              751       2406
## 40        1      3  56159   555     902  10002              212       2916
## 259       1      1  56083  4563    2124   6422              730       3321
## 87        2      3  22925 73498   32114    987            20070        903
## 48        2      3  44466 54259   55571   7782            24171       6465
## 86        2      3  16117 46197   92780   1026            40827       2944
## 184       1      3  36847 43950   20170  36534              239      47943
## 62        2      3  35942 38369   59598   3254            26701       2017
## 334       2      2   8565  4980   67298    131            38102       1215
## 66        2      3     85 20959   45828     36            24231       1423
#Remove the Customers
data.rm.top<-data[-c(top.custs),] 
#Summary stats for the remaining data
print(summary(data.rm.top))
##     Channel         Region          Fresh            Milk      
##  Min.   :1.00   Min.   :1.000   Min.   :    3   Min.   :   55  
##  1st Qu.:1.00   1st Qu.:2.000   1st Qu.: 3072   1st Qu.: 1497  
##  Median :1.00   Median :3.000   Median : 8130   Median : 3582  
##  Mean   :1.31   Mean   :2.531   Mean   :11076   Mean   : 5172  
##  3rd Qu.:2.00   3rd Qu.:3.000   3rd Qu.:16251   3rd Qu.: 6962  
##  Max.   :2.00   Max.   :3.000   Max.   :56082   Max.   :36423  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2132   1st Qu.:  738.8   1st Qu.:  255.2   1st Qu.:  398.0  
##  Median : 4603   Median : 1487.5   Median :  799.5   Median :  904.5  
##  Mean   : 7211   Mean   : 2910.3   Mean   : 2541.6   Mean   : 1352.0  
##  3rd Qu.:10391   3rd Qu.: 3428.0   3rd Qu.: 3879.2   3rd Qu.: 1752.2  
##  Max.   :39694   Max.   :60869.0   Max.   :19410.0   Max.   :16523.0
#Set the seed
set.seed(76964057)
k <-kmeans(data.rm.top[,-c(1,2)], centers=5) #Create 5 clusters, Remove columns 1 and 2
k$centers 
##       Fresh      Milk   Grocery   Frozen Detergents_Paper Delicassen
## 1  4130.566  7624.663 11763.687 1268.361        5135.1205  1301.4217
## 2 18932.426  3374.020  4801.911 3180.574        1142.0198  1536.0099
## 3 37275.906  5219.844  5850.094 6883.125         824.8125  2169.3438
## 4  6420.711 16893.947 23928.632 2162.421       10290.9737  2438.7368
## 5  5847.637  2379.458  2887.440 2971.375         675.7440   864.8452
table(k$cluster) ##count of data in every cluster
## 
##   1   2   3   4   5 
##  83 101  32  38 168
#K from 2 to 20
rng<-2:20 
# K Means algorithm
tries <-100 
#Set up an empty vector 
avg.totw.ss <-integer(length(rng))
avg.totb.ss <- integer(length(rng))
avg.tot.ss <- integer(length(rng))
for(v in rng){

 v.totw.ss <-integer(tries)
 b.totb.ss <- integer(tries)
 tot.ss <- integer(tries)

 #Running kmeans
 for(i in 1:tries){
 k.temp <-kmeans(data.rm.top,centers=v)


 v.totw.ss[i] <-k.temp$tot.withinss


 b.totb.ss[i] <- k.temp$betweenss


 tot.ss[i] <- k.temp$totss
 }


 avg.totw.ss[v-1] <-mean(v.totw.ss)
 avg.totb.ss[v-1] <-mean(b.totb.ss)
 avg.tot.ss[v-1] <- mean(tot.ss)
}
# For each value of the range variable
for(v in rng){

#Set up an empty vector 
 v.totw.ss <-integer(tries)
 b.totb.ss <- integer(tries)
 tot.ss <- integer(tries)

 #Run kmeans
 for(i in 1:tries){
 k.temp <-kmeans(data.rm.top,centers=v)


 v.totw.ss[i] <-k.temp$tot.withinss


 b.totb.ss[i] <- k.temp$betweenss

 #Store the total sum of squares
 tot.ss[i] <- k.temp$totss
 }


 avg.totw.ss[v-1] <-mean(v.totw.ss)
 avg.totb.ss[v-1] <-mean(b.totb.ss)
 avg.tot.ss[v-1] <- mean(tot.ss)
}
## Warning: did not converge in 10 iterations
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
 ylab="Average Total Within Sum of Squares",
 xlab="Value of K")

plot(rng,avg.totb.ss,type="b", main="Total between SS by Various K",
 ylab="Average Total Between Sum of Squares",
 xlab="Value of K")

#Plot the ratio 
plot(rng,avg.totb.ss/avg.tot.ss,type="b", main="Ratio of between ss / the total ss by Various K",
 ylab="Ratio Between SS / Total SS",
 xlab="Value of K")
abline(h=0.85, col="red")

plot(rng,avg.totw.ss/avg.tot.ss,type="b", main="Ratio of within ss / the total ss by Various K",
 ylab="Ratio Between SS / Total SS",
 xlab="Value of K")
abline(h=0.15, col="red")

#Plot the ratio 
plot(rng,avg.totb.ss/avg.tot.ss,type="b", main="Ratio of between ss / the total ss by Various K",
 ylab="Ratio Between SS / Total SS",
 xlab="Value of K")
abline(h=0.85, col="red")

plot(rng,avg.totw.ss/avg.tot.ss,type="b", main="Ratio of within ss / the total ss by Various K",
 ylab="Ratio Between SS / Total SS",
 xlab="Value of K")
abline(h=0.15, col="red")

Here, we can conclude that Cluster 1 looks to be a heavy Grocery and above average Detergents Paper but low Fresh foods. Whereas, Cluster 3 is dominant in the Fresh category. Cluster 5 might be either the “junk drawer” or it might represent the small customers. around K = 5 we start losing dramatic gains. So we are satisfied with 5 clusters.

Part 2: K means for Wine

wine <- read.csv("wine.csv")
str(wine)
## 'data.frame':    178 obs. of  14 variables:
##  $ Wine                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic.acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Acl                 : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Mg                  : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Phenols             : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid.phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanth             : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color.int           : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD                  : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
wssplot <- function(data, nc=15, seed=1234){
  wss <- (nrow(data)-1)*sum(apply(data,2,var))
  for (i in 2:nc){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)}
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")}
#Load data
wine <- read.csv("wine.csv")
str(wine)
## 'data.frame':    178 obs. of  14 variables:
##  $ Wine                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic.acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Acl                 : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Mg                  : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Phenols             : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid.phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanth             : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color.int           : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD                  : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
df <- scale(wine[-1])
#Examine the data frame and plot 
head(df)
##        Alcohol  Malic.acid        Ash        Acl         Mg   Phenols
## [1,] 1.5143408 -0.56066822  0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252  0.02117152  1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508  0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684  0.22705328  1.8352256  0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132  0.3043010 -1.2860793 0.85828399 1.5576991
##      Flavanoids Nonflavanoid.phenols    Proanth  Color.int        Hue
## [1,]  1.0319081           -0.6577078  1.2214385  0.2510088  0.3611585
## [2,]  0.7315653           -0.8184106 -0.5431887 -0.2924962  0.4049085
## [3,]  1.2121137           -0.4970050  2.1299594  0.2682629  0.3174085
## [4,]  1.4623994           -0.9791134  1.0292513  1.1827317 -0.4263410
## [5,]  0.6614853            0.2261576  0.4002753 -0.3183774  0.3611585
## [6,]  1.3622851           -0.1755994  0.6623487  0.7298108  0.4049085
##             OD     Proline
## [1,] 1.8427215  1.01015939
## [2,] 1.1103172  0.96252635
## [3,] 0.7863692  1.39122370
## [4,] 1.1807407  2.32800680
## [5,] 0.4483365 -0.03776747
## [6,] 0.3356589  2.23274072
wssplot(df)

library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 15 proposed 3 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
print(table(nc$Best.n[1,]))
## 
##  0  1  2  3 10 12 14 15 
##  2  1  4 15  1  1  1  1
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")

Now, we would use the best value of n to train the model using kmeans() command and print the confusion matrix and accuracy.

set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
(df_km <- table(wine$Wine, fit.km$cluster))
##    
##      1  2  3
##   1 59  0  0
##   2  3 65  3
##   3  0  0 48
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 96.62921
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
         color=TRUE, shade=TRUE,
         labels=2, lines=0)

Part 3

Train a classifier to classify wines using the clusters we obtained in Part II

library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart)
library(rpart.plot)
library(RColorBrewer)

Label the data frame, with the cluster labels

df <- data.frame(k=fit.km$cluster, df)
df_rpart <- data.frame(k=fit.km$cluster, df)

Using Decision tree model for training the data

rdf <- df_rpart[sample(1:nrow(df_rpart)), ]

Using 80% of data for training and 20% of data for testing

train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]
library(rpart)
fit <- rpart(k ~ ., data=train, method="class")
library(rattle)
fancyRpartPlot(fit)

pred <- predict(fit, test, type="class")
(news_tbl <- table(pred, test$k))
##     
## pred  1  2  3
##    1 11  0  0
##    2  0 14  0
##    3  0  0 12
(Accuracy <- (sum(diag(news_tbl))/sum(news_tbl)*100))
## [1] 100