Integrate unsupervised and supervised learning techniques.

Part 1 - Running a k-means analysis: customer segmentation

#Load the wholesale customers data and examine its structure
data <- read.csv("Wholesale_customers_data.csv")
str(data)
## 'data.frame':    440 obs. of  8 variables:
##  $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
##  $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
##  $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
##  $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
##  $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
##  $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
summary(data)
##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0

There is obviously a big difference between the top customers in each category and the rest (e.g. Fresh ranges from a minimum of 3 to a maximum of 112,151).

So, before clustering and segmenting, we will remove the top five customers from each category so that these extreme spenders do not skew the results.

#Takes a data frame, the columns to scan, and the top N customers to remove
top.n.custs <- function(data, cols, n = 5) {
 #Initialize a vector to hold the indices of the customers being removed
 idx.to.remove <- integer(0)

 #For every column in the data passed to this function
 for (c in cols){

 #Sort column "c" in descending order (biggest on top);
 #order() returns the sorted row indices (e.g. 15, 3, 7, 1, ...)
 #rather than the sorted values themselves
 col.order <- order(data[, c], decreasing = TRUE)

 #Take the first n entries of the sorted column, then
 #combine and de-duplicate the row ids that need to be removed
 idx <- head(col.order, n)
 idx.to.remove <- union(idx.to.remove, idx)
 }

 #Return the indexes of customers to be removed
 return(idx.to.remove)

}
#How many customers will be removed? (Note: cols = 1:5 spans Channel, Region,
#Fresh, Milk and Grocery; cols = 3:8 would cover exactly the six spending categories.)
top.custs <- top.n.custs(data, cols = 1:5, n = 5)
length(top.custs) 
## [1] 18
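#Only 18 rows (not 25) are flagged because union() de-duplicates customers
#that rank in the top five of more than one column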
#Examine the customers
data[top.custs,] 
##     Channel Region  Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
## 1         2      3  12669  9656    7561    214             2674       1338
## 2         2      3   7057  9810    9568   1762             3293       1776
## 3         2      3   6353  8808    7684   2405             3516       7844
## 5         2      3  22615  5410    7198   3915             1777       5185
## 6         2      3   9413  8259    5126    666             1795       1451
## 4         1      3  13265  1196    4221   6404              507       1788
## 182       1      3 112151 29627   18148  16745             4948       8550
## 126       1      3  76237  3473    7102  16538              778        918
## 285       1      3  68951  4411   12609   8692              751       2406
## 40        1      3  56159   555     902  10002              212       2916
## 259       1      1  56083  4563    2124   6422              730       3321
## 87        2      3  22925 73498   32114    987            20070        903
## 48        2      3  44466 54259   55571   7782            24171       6465
## 86        2      3  16117 46197   92780   1026            40827       2944
## 184       1      3  36847 43950   20170  36534              239      47943
## 62        2      3  35942 38369   59598   3254            26701       2017
## 334       2      2   8565  4980   67298    131            38102       1215
## 66        2      3     85 20959   45828     36            24231       1423

Using data.rm.top, we can perform the cluster analysis. Here, we remove the Channel and Region variables, as they are categorical codes rather than spending amounts and are not useful for clustering.

#Remove the flagged customers
data.rm.top <- data[-top.custs, ]
#Examine summary stats for the remaining data
print(summary(data.rm.top))
##     Channel         Region          Fresh            Milk      
##  Min.   :1.00   Min.   :1.000   Min.   :    3   Min.   :   55  
##  1st Qu.:1.00   1st Qu.:2.000   1st Qu.: 3072   1st Qu.: 1497  
##  Median :1.00   Median :3.000   Median : 8130   Median : 3582  
##  Mean   :1.31   Mean   :2.531   Mean   :11076   Mean   : 5172  
##  3rd Qu.:2.00   3rd Qu.:3.000   3rd Qu.:16251   3rd Qu.: 6962  
##  Max.   :2.00   Max.   :3.000   Max.   :56082   Max.   :36423  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2132   1st Qu.:  738.8   1st Qu.:  255.2   1st Qu.:  398.0  
##  Median : 4603   Median : 1487.5   Median :  799.5   Median :  904.5  
##  Mean   : 7211   Mean   : 2910.3   Mean   : 2541.6   Mean   : 1352.0  
##  3rd Qu.:10391   3rd Qu.: 3428.0   3rd Qu.: 3879.2   3rd Qu.: 1752.2  
##  Max.   :39694   Max.   :60869.0   Max.   :19410.0   Max.   :16523.0
#Set the seed for reproducibility
set.seed(76964057)
k <- kmeans(data.rm.top[, -c(1, 2)], centers = 5) #Create 5 clusters; drop columns 1 and 2 (Channel, Region)
k$centers #Display cluster centers
##       Fresh      Milk   Grocery   Frozen Detergents_Paper Delicassen
## 1  4130.566  7624.663 11763.687 1268.361        5135.1205  1301.4217
## 2 18932.426  3374.020  4801.911 3180.574        1142.0198  1536.0099
## 3 37275.906  5219.844  5850.094 6883.125         824.8125  2169.3438
## 4  6420.711 16893.947 23928.632 2162.421       10290.9737  2438.7368
## 5  5847.637  2379.458  2887.440 2971.375         675.7440   864.8452
table(k$cluster) #Give a count of data points in each cluster
## 
##   1   2   3   4   5 
##  83 101  32  38 168
#Try K from 2 to 20
rng<-2:20 
#Number of times to run the K Means algorithm
tries <-100 
#Set up empty vectors to hold the averaged results for each K
avg.totw.ss <- integer(length(rng))
avg.totb.ss <- integer(length(rng))
avg.tot.ss <- integer(length(rng))
# For each value of the range variable
for(v in rng){

 #Set up empty vectors to hold the results of each try
 v.totw.ss <- integer(tries)
 b.totb.ss <- integer(tries)
 tot.ss <- integer(tries)

 #Run kmeans, again excluding the Channel and Region columns
 for(i in 1:tries){
 k.temp <- kmeans(data.rm.top[, -c(1, 2)], centers = v)

 #Store the total withinss
 v.totw.ss[i] <-k.temp$tot.withinss

 #Store the betweenss
 b.totb.ss[i] <- k.temp$betweenss

 #Store the total sum of squares
 tot.ss[i] <- k.temp$totss
 }

 #Average the withinss and betweenss
 avg.totw.ss[v-1] <-mean(v.totw.ss)
 avg.totb.ss[v-1] <-mean(b.totb.ss)
 avg.tot.ss[v-1] <- mean(tot.ss)
}
## Warning: did not converge in 10 iterations
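
The "did not converge in 10 iterations" warning means kmeans() hit its default cap of 10 iterations on some runs; passing a larger iter.max (e.g. kmeans(..., iter.max = 30)) avoids it.
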
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
 ylab="Average Total Within Sum of Squares",
 xlab="Value of K")

plot(rng,avg.totb.ss,type="b", main="Total Between SS by Various K",
 ylab="Average Total Between Sum of Squares",
 xlab="Value of K")

#Plot the ratios of betweenss/totss and withinss/totss for evaluation
plot(rng,avg.totb.ss/avg.tot.ss,type="b", main="Ratio of Between SS to Total SS by Various K",
 ylab="Ratio Between SS / Total SS",
 xlab="Value of K")
abline(h=0.85, col="red")

plot(rng,avg.totw.ss/avg.tot.ss,type="b", main="Ratio of Within SS to Total SS by Various K",
 ylab="Ratio Within SS / Total SS",
 xlab="Value of K")
abline(h=0.15, col="red")

Here we can conclude that Cluster 1 is heavy on Grocery, with above-average Detergents_Paper but low Fresh spending, whereas Cluster 3 is dominated by the Fresh category. Cluster 5 might be either the "junk drawer" or the small customers. Somewhere around K = 5 the dramatic gains taper off, so we are satisfied with 5 clusters.
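
To double-check this reading of the centers, here is a small sketch (not part of the original script) that attaches the k = 5 labels back to the trimmed data and averages the spend per category within each cluster; the result should match k$centers.

seg <- data.rm.top[, -c(1, 2)] #drop Channel and Region, as in the fit
seg$cluster <- k$cluster #attach the cluster assignments
aggregate(. ~ cluster, data = seg, FUN = mean) #mean spend per category per cluster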

k$withinss tells us the sum of the squared distances from each data point to its cluster center. Lower is better. A high withinss indicates either that outliers remain in our data or that we need to create more clusters. k$betweenss tells us the sum of the squared distances between cluster centers. Ideally we want cluster centers far apart from each other.
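
Both quantities are available directly on the fitted kmeans object; here is a minimal sketch for inspecting the final k = 5 fit:

k$tot.withinss #total within-cluster sum of squares (lower is better)
k$betweenss #between-cluster sum of squares (higher is better)
round(k$betweenss / k$totss, 3) #share of total variance explained by the clustering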

Part 2: K-means for wine

wine <- read.csv("wine.csv")
str(wine)
## 'data.frame':    178 obs. of  14 variables:
##  $ Wine                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic.acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Acl                 : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Mg                  : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Phenols             : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid.phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanth             : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color.int           : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD                  : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...

Using a short function, wssplot, we will plot the within-cluster sum of squares against different values of k.
Later, NbClust() will generate a table and a barplot to help us determine the best number of clusters for this dataset.

wssplot <- function(data, nc=15, seed=1234){
  #For k = 1, the within-groups SS equals the total SS: (n-1) * sum of column variances
  wss <- (nrow(data)-1)*sum(apply(data,2,var))
  #For k = 2..nc, run kmeans and record the total within-groups SS
  for (i in 2:nc){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)}
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")}
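
An "elbow" in the resulting curve, where the drop in the within-groups sum of squares levels off, suggests a reasonable choice of k.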

We scale the data before applying machine-learning techniques: k-means is distance-based, so variables measured on large scales (such as Proline) would otherwise dominate the distance calculations.

#Load the data and scale every variable except the wine type in column 1
wine <- read.csv("wine.csv")
df <- scale(wine[-1])
#Examine the data frame and plot the within sum of squares
head(df)
##        Alcohol  Malic.acid        Ash        Acl         Mg   Phenols
## [1,] 1.5143408 -0.56066822  0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252  0.02117152  1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508  0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684  0.22705328  1.8352256  0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132  0.3043010 -1.2860793 0.85828399 1.5576991
##      Flavanoids Nonflavanoid.phenols    Proanth  Color.int        Hue
## [1,]  1.0319081           -0.6577078  1.2214385  0.2510088  0.3611585
## [2,]  0.7315653           -0.8184106 -0.5431887 -0.2924962  0.4049085
## [3,]  1.2121137           -0.4970050  2.1299594  0.2682629  0.3174085
## [4,]  1.4623994           -0.9791134  1.0292513  1.1827317 -0.4263410
## [5,]  0.6614853            0.2261576  0.4002753 -0.3183774  0.3611585
## [6,]  1.3622851           -0.1755994  0.6623487  0.7298108  0.4049085
##             OD     Proline
## [1,] 1.8427215  1.01015939
## [2,] 1.1103172  0.96252635
## [3,] 0.7863692  1.39122370
## [4,] 1.1807407  2.32800680
## [5,] 0.4483365 -0.03776747
## [6,] 0.3356589  2.23274072
wssplot(df)
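
As a quick sanity check (an illustrative aside, not part of the original script), every column of a scaled matrix should have a mean of approximately 0 and a standard deviation of 1:

round(colMeans(df), 3) #column means should all be ~0
round(apply(df, 2, sd), 3) #column standard deviations should all be 1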

We use the NbClust() function to help choose k for the k-means analysis: it evaluates a large set of clustering criteria and reports the number of clusters that the majority of them recommend.

library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 15 proposed 3 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
print(table(nc$Best.n[1,]))
## 
##  0  1  2  3 10 12 14 15 
##  2  1  4 15  1  1  1  1
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")

Now we use the best value of k (three clusters) to fit the model with kmeans() and print the confusion matrix and accuracy. Setting nstart=25 runs the algorithm from 25 random starting configurations and keeps the best solution, reducing sensitivity to the initial centers.

set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
(df_km <- table(wine$Wine, fit.km$cluster))
##    
##      1  2  3
##   1 59  0  0
##   2  3 65  3
##   3  0  0 48
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 96.62921
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
         color=TRUE, shade=TRUE,
         labels=2, lines=0)
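
As an optional extra check (this assumes the flexclust package is installed; it is not used elsewhere in this script), the adjusted Rand index quantifies the agreement between the wine types and the clusters while correcting for chance, ranging from -1 (no agreement) to 1 (perfect agreement):

library(flexclust)
randIndex(df_km) #adjusted Rand index for the type-vs-cluster table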

Part 3: Train a classifier to classify wines using the clusters we obtained in Part 2

library(rattle)
## Warning: package 'rattle' was built under R version 3.4.4
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.4
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
library(RColorBrewer)

Label the data frame with the cluster assignments.

df_rpart <- data.frame(k=fit.km$cluster, df) #prepend the cluster assignment as column k

We will train a decision-tree model; first, shuffle the rows of the labeled data frame.

rdf <- df_rpart[sample(1:nrow(df_rpart)), ] #shuffle the rows (setting a seed first would make the split reproducible)

Use 80% of the data for training and 20% for testing.

train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]
fit <- rpart(k ~ ., data=train, method="class") #fit a classification tree predicting the cluster label
fancyRpartPlot(fit)

pred <- predict(fit, test, type="class")
(news_tbl <- table(pred, test$k))
##     
## pred  1  2  3
##    1 11  0  0
##    2  0 14  0
##    3  0  0 12
(Accuracy <- (sum(diag(news_tbl))/sum(news_tbl)*100))
## [1] 100
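
Near-perfect accuracy is expected here: the tree is predicting cluster labels that were themselves derived from the same scaled features, so the classes are cleanly separable for the classifier.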