data <- read.csv("C:/Users/Bkulkarni/Wholesale_customers_data.csv")
summary(data)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
top.n.custs <- function (data,cols,n=5) {
idx.to.remove <-integer(0)
for (c in cols){
col.order <-order(data[,c],decreasing=T)
idx <-head(col.order, n)
idx.to.remove <-union(idx.to.remove,idx)
}
return(idx.to.remove)
}
top.custs <- top.n.custs(data,cols = 3:8, n = 5 )
length(top.custs)
## [1] 19
data[top.custs,]
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 182 1 3 112151 29627 18148 16745 4948 8550
## 126 1 3 76237 3473 7102 16538 778 918
## 285 1 3 68951 4411 12609 8692 751 2406
## 40 1 3 56159 555 902 10002 212 2916
## 259 1 1 56083 4563 2124 6422 730 3321
## 87 2 3 22925 73498 32114 987 20070 903
## 48 2 3 44466 54259 55571 7782 24171 6465
## 86 2 3 16117 46197 92780 1026 40827 2944
## 184 1 3 36847 43950 20170 36534 239 47943
## 62 2 3 35942 38369 59598 3254 26701 2017
## 334 2 2 8565 4980 67298 131 38102 1215
## 66 2 3 85 20959 45828 36 24231 1423
## 326 1 2 32717 16784 13626 60869 1272 5609
## 94 1 3 11314 3090 2062 35009 71 2698
## 197 1 1 30624 7209 4897 18711 763 2876
## 104 1 3 56082 3504 8906 18028 1480 2498
## 24 2 3 26373 36423 22019 5154 4337 16523
## 72 1 3 18291 1266 21042 5373 4173 14472
## 88 1 3 43265 5025 8117 6312 1579 14351
data.rm.top<-data[-c(top.custs),]
set.seed(76964057)
k <-kmeans(data.rm.top[,-c(1,2)], centers=5)
k$centers
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 4189.747 7645.639 11015.277 1335.145 4750.4819 1387.1205
## 2 16470.870 3026.491 4264.741 3217.306 996.5556 1319.7593
## 3 33120.163 4896.977 5579.860 3823.372 945.4651 1620.1860
## 4 5830.214 15295.048 23449.167 1936.452 10361.6429 1912.7381
## 5 5043.434 2329.683 2786.138 2689.814 652.8276 849.8414
table(k$cluster)
##
## 1 2 3 4 5
## 83 108 43 42 145
rng<-2:20
tries <-100
avg.totw.ss <-integer(length(rng))
for(v in rng){
v.totw.ss <-integer(tries)
for(i in 1:tries){
k.temp <-kmeans(data.rm.top,centers=v)
v.totw.ss[i] <-k.temp$tot.withinss
}
avg.totw.ss[v-1] <-mean(v.totw.ss)
}
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")
wine <- read.csv("C:/Users/Bkulkarni/wine.csv")
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Wine : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic.acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Acl : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Mg : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid.phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanth : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color.int : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
wssplot <- function(data, nc=15, seed=1234){
wss <- (nrow(data)-1)*sum(apply(data,2,var))
for (i in 2:nc){
set.seed(seed)
wss[i] <- sum(kmeans(data, centers=i)$withinss)}
plot(1:nc, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")}
df <- scale(wine[-1])
wssplot(df)
library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 15 proposed 3 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
(df_km <- table(wine$Wine, fit.km$cluster))
##
## 1 2 3
## 1 59 0 0
## 2 3 65 3
## 3 0 0 48
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 96.62921
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE,
labels=2, lines=0)
df_rpart <- data.frame(k=fit.km$cluster, df)
rdf <- df_rpart[sample(1:nrow(df_rpart)), ]
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]
library(rpart)
fit <- rpart(k ~ ., data=train, method="class")
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
fancyRpartPlot(fit)
pred <- predict(fit, test, type="class")
(news_tbl <- table(pred, test$k))
##
## pred 1 2 3
## 1 11 1 0
## 2 0 12 1
## 3 0 1 11
(Accuracy <- (sum(diag(news_tbl))/sum(news_tbl)*100))
## [1] 91.89189
Misclassification occurs when individuals are assigned to a different category than the one they should be in. This can lead to incorrect associations being observed between the assigned categories and the outcomes of interest. Misclassification causes a bias in the resulting information
A simulation approach allowing comparison of misclassification errors and estimating the required sample size for a given effect size, number, and correlation matrices can give more information on the expected % of errors. In the simulation, by setting the misclassification error at some level, e.g. 5% we can estimate the required effect size given the sample size (number of patients), or the required sample size for the expected effect size.
The problem of the fluctuating error of k-means can be solved by looking into the details of the algorithm and comparing it with hierarchical clustering and k-medoids.