Lab-4 Part I

Dataset

data <- read.csv("C:/Users/Bkulkarni/Wholesale_customers_data.csv")
summary(data)
##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0

Pre-processing

top.n.custs <- function (data,cols,n=5) { 
  idx.to.remove <-integer(0) 
  for (c in cols){ 
   col.order <-order(data[,c],decreasing=T) 
   idx <-head(col.order, n) 
   idx.to.remove <-union(idx.to.remove,idx) 
  }
  return(idx.to.remove)
}
top.custs <- top.n.custs(data,cols = 3:8, n = 5 )
length(top.custs)
## [1] 19
data[top.custs,]
##     Channel Region  Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
## 182       1      3 112151 29627   18148  16745             4948       8550
## 126       1      3  76237  3473    7102  16538              778        918
## 285       1      3  68951  4411   12609   8692              751       2406
## 40        1      3  56159   555     902  10002              212       2916
## 259       1      1  56083  4563    2124   6422              730       3321
## 87        2      3  22925 73498   32114    987            20070        903
## 48        2      3  44466 54259   55571   7782            24171       6465
## 86        2      3  16117 46197   92780   1026            40827       2944
## 184       1      3  36847 43950   20170  36534              239      47943
## 62        2      3  35942 38369   59598   3254            26701       2017
## 334       2      2   8565  4980   67298    131            38102       1215
## 66        2      3     85 20959   45828     36            24231       1423
## 326       1      2  32717 16784   13626  60869             1272       5609
## 94        1      3  11314  3090    2062  35009               71       2698
## 197       1      1  30624  7209    4897  18711              763       2876
## 104       1      3  56082  3504    8906  18028             1480       2498
## 24        2      3  26373 36423   22019   5154             4337      16523
## 72        1      3  18291  1266   21042   5373             4173      14472
## 88        1      3  43265  5025    8117   6312             1579      14351

Model Design

data.rm.top<-data[-c(top.custs),]
set.seed(76964057) 
k <-kmeans(data.rm.top[,-c(1,2)], centers=5)
k$centers
##       Fresh      Milk   Grocery   Frozen Detergents_Paper Delicassen
## 1  4189.747  7645.639 11015.277 1335.145        4750.4819  1387.1205
## 2 16470.870  3026.491  4264.741 3217.306         996.5556  1319.7593
## 3 33120.163  4896.977  5579.860 3823.372         945.4651  1620.1860
## 4  5830.214 15295.048 23449.167 1936.452       10361.6429  1912.7381
## 5  5043.434  2329.683  2786.138 2689.814         652.8276   849.8414
table(k$cluster)
## 
##   1   2   3   4   5 
##  83 108  43  42 145

Optimum K

rng<-2:20 
tries <-100 
avg.totw.ss <-integer(length(rng)) 
for(v in rng){ 
 v.totw.ss <-integer(tries) 
 for(i in 1:tries){
 k.temp <-kmeans(data.rm.top,centers=v) 
 v.totw.ss[i] <-k.temp$tot.withinss
 }
 avg.totw.ss[v-1] <-mean(v.totw.ss) 
}
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
 ylab="Average Total Within Sum of Squares",
 xlab="Value of K")

Lab4-Part II

Dataset

wine <- read.csv("C:/Users/Bkulkarni/wine.csv")
str(wine)
## 'data.frame':    178 obs. of  14 variables:
##  $ Wine                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic.acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Acl                 : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Mg                  : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Phenols             : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid.phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanth             : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color.int           : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD                  : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...

Best K Value

wssplot <- function(data, nc=15, seed=1234){
  wss <- (nrow(data)-1)*sum(apply(data,2,var))
  for (i in 2:nc){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)}
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")}

Scaling the data

df <- scale(wine[-1])

Optimum K

wssplot(df)

K-Means Analysis

library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 15 proposed 3 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")

K-means Train and evaluation

set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
(df_km <- table(wine$Wine, fit.km$cluster))
##    
##      1  2  3
##   1 59  0  0
##   2  3 65  3
##   3  0  0 48
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 96.62921

K-means plot

library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
         color=TRUE, shade=TRUE,
         labels=2, lines=0)

Lab-4 Part III

Preparing the dataset

df_rpart <- data.frame(k=fit.km$cluster, df)
rdf <- df_rpart[sample(1:nrow(df_rpart)), ]
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]

Training the data

library(rpart)
fit <- rpart(k ~ ., data=train, method="class")
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
fancyRpartPlot(fit)

Model evaluation

pred <- predict(fit, test, type="class")
(news_tbl <- table(pred, test$k))
##     
## pred  1  2  3
##    1 11  1  0
##    2  0 12  1
##    3  0  1 11
(Accuracy <- (sum(diag(news_tbl))/sum(news_tbl)*100))
## [1] 91.89189

Summary

Misclassification occurs when individuals are assigned to a different category than the one they should be in. This can lead to incorrect associations being observed between the assigned categories and the outcomes of interest. Misclassification causes a bias in the resulting information

A simulation approach allowing comparison of misclassification errors and estimating the required sample size for a given effect size, number, and correlation matrices can give more information on the expected % of errors. In the simulation, by setting the misclassification error at some level, e.g. 5% we can estimate the required effect size given the sample size (number of patients), or the required sample size for the expected effect size.

The problem of the fluctuating error of k-means can be solved by looking into the details of the algorithm and comparing it with hierarchical clustering and k-medoids.