In this section we integrate unsupervised and supervised learning techniques: we first segment customers with k-means clustering, then train a decision tree to predict the cluster labels.
#Read the wholesale customers data into a data frame and examine its structure
data <- read.csv("Wholesale_customers_data.csv")
str(data)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
summary(data)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
There is obviously a big difference between the top customers in each category and the rest (e.g. Fresh ranges from a minimum of 3 to a maximum of 112,151). Such extreme values would dominate the cluster centers, so before clustering and segmenting we remove the top 5 customers from each category.
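To make the skew concrete before trimming, a quick boxplot of the six spending columns helps; this is a minimal sketch, assuming the data frame loaded above.
#Columns 3:8 are the six product categories; las = 2 rotates the labels
boxplot(data[, 3:8], las = 2, main = "Spending by Category")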
#top.n.custs: takes a data frame, the columns to scan, and the top n to remove;
#it returns the row indices of the top-n customers in each of those columns
top.n.custs <- function(data, cols, n = 5) {
  #Initialize a vector to hold customers being removed
  idx.to.remove <- integer(0)
  for (c in cols) {
    #For every column passed to this function, sort column "c" in
    #descending order (bigger on top). order() returns the sorted
    #indices (e.g. row 15, 3, 7, 1, ...) rather than the sorted values.
    col.order <- order(data[, c], decreasing = TRUE)
    #Take the first n indices of the sorted column, then combine and
    #de-duplicate the row ids that need to be removed
    idx <- head(col.order, n)
    idx.to.remove <- union(idx.to.remove, idx)
  }
  #Return the indices of customers to be removed
  return(idx.to.remove)
}
#How many customers will be removed? Note that cols = 1:5 spans Channel
#through Grocery; to trim only the six spending categories, cols = 3:8
#would be the natural choice (the output below reflects cols = 1:5)
top.custs <- top.n.custs(data, cols = 1:5, n = 5)
length(top.custs)
## [1] 18
#Examine the customers
data[top.custs,]
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
## 4 1 3 13265 1196 4221 6404 507 1788
## 182 1 3 112151 29627 18148 16745 4948 8550
## 126 1 3 76237 3473 7102 16538 778 918
## 285 1 3 68951 4411 12609 8692 751 2406
## 40 1 3 56159 555 902 10002 212 2916
## 259 1 1 56083 4563 2124 6422 730 3321
## 87 2 3 22925 73498 32114 987 20070 903
## 48 2 3 44466 54259 55571 7782 24171 6465
## 86 2 3 16117 46197 92780 1026 40827 2944
## 184 1 3 36847 43950 20170 36534 239 47943
## 62 2 3 35942 38369 59598 3254 26701 2017
## 334 2 2 8565 4980 67298 131 38102 1215
## 66 2 3 85 20959 45828 36 24231 1423
Using data.rm.top, we can perform the cluster analysis. We also drop the Channel and Region variables, since these categorical codes are not useful for clustering on spending.
#Remove the top customers identified above
data.rm.top <- data[-top.custs, ]
#Examine summary stats for the remaining data
print(summary(data.rm.top))
## Channel Region Fresh Milk
## Min. :1.00 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.00 1st Qu.:2.000 1st Qu.: 3072 1st Qu.: 1497
## Median :1.00 Median :3.000 Median : 8130 Median : 3582
## Mean :1.31 Mean :2.531 Mean :11076 Mean : 5172
## 3rd Qu.:2.00 3rd Qu.:3.000 3rd Qu.:16251 3rd Qu.: 6962
## Max. :2.00 Max. :3.000 Max. :56082 Max. :36423
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2132 1st Qu.: 738.8 1st Qu.: 255.2 1st Qu.: 398.0
## Median : 4603 Median : 1487.5 Median : 799.5 Median : 904.5
## Mean : 7211 Mean : 2910.3 Mean : 2541.6 Mean : 1352.0
## 3rd Qu.:10391 3rd Qu.: 3428.0 3rd Qu.: 3879.2 3rd Qu.: 1752.2
## Max. :39694 Max. :60869.0 Max. :19410.0 Max. :16523.0
#Set the seed for reproducibility
set.seed(76964057)
k <-kmeans(data.rm.top[,-c(1,2)], centers=5) #Create 5 clusters, Remove columns 1 and 2
k$centers #Display cluster centers
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 4130.566 7624.663 11763.687 1268.361 5135.1205 1301.4217
## 2 18932.426 3374.020 4801.911 3180.574 1142.0198 1536.0099
## 3 37275.906 5219.844 5850.094 6883.125 824.8125 2169.3438
## 4 6420.711 16893.947 23928.632 2162.421 10290.9737 2438.7368
## 5 5847.637 2379.458 2887.440 2971.375 675.7440 864.8452
table(k$cluster) #Give a count of data points in each cluster
##
## 1 2 3 4 5
## 83 101 32 38 168
#Try K from 2 to 20
rng<-2:20
#Number of times to run the K Means algorithm
tries <-100
#Set up empty vectors to hold the averaged results
avg.totw.ss <- integer(length(rng))
avg.totb.ss <- integer(length(rng))
avg.tot.ss <- integer(length(rng))
#For each value of K in the range
for (v in rng) {
  #Set up empty vectors to hold the individual tries
  v.totw.ss <- integer(tries)
  b.totb.ss <- integer(tries)
  tot.ss <- integer(tries)
  #Run kmeans "tries" times, again excluding Channel and Region
  for (i in 1:tries) {
    k.temp <- kmeans(data.rm.top[, -c(1, 2)], centers = v)
    #Store the total withinss
    v.totw.ss[i] <- k.temp$tot.withinss
    #Store the betweenss
    b.totb.ss[i] <- k.temp$betweenss
    #Store the total sum of squares
    tot.ss[i] <- k.temp$totss
  }
  #Average the withinss, betweenss, and totss over the tries
  avg.totw.ss[v-1] <- mean(v.totw.ss)
  avg.totb.ss[v-1] <- mean(b.totb.ss)
  avg.tot.ss[v-1] <- mean(tot.ss)
}
## Warning: did not converge in 10 iterations
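The warning comes from kmeans()'s default cap of 10 iterations. A hedged fix for the loop above: raising iter.max (and optionally adding random restarts via nstart, both standard kmeans() arguments) usually makes the warning disappear.
#Inside the loop above, where v is the candidate number of clusters:
k.temp <- kmeans(data.rm.top[, -c(1, 2)], centers = v, iter.max = 100, nstart = 10)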
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")
plot(rng,avg.totb.ss,type="b", main="Total between SS by Various K",
ylab="Average Total Between Sum of Squares",
xlab="Value of K")
#Plot the ratio of betweenss/total ss and withinss / total ss for evaluation
plot(rng,avg.totb.ss/avg.tot.ss,type="b", main="Ratio of between ss / the total ss by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.85, col="red")
plot(rng,avg.totw.ss/avg.tot.ss,type="b", main="Ratio of within ss / the total ss by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.15, col="red")
From the cluster centers, Cluster 1 looks to be heavy on Grocery and above average on Detergents_Paper but low on Fresh foods, whereas Cluster 3 is dominant in the Fresh category. Cluster 5 might be either the "junk drawer" or the small customers. Somewhere around K = 5 the dramatic gains in the plots above level off, so we are satisfied with 5 clusters.
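To ground this interpretation, the clusters can be profiled directly from the data; a minimal sketch using the objects created above:
#Average spend per category within each cluster (mirrors k$centers)
aggregate(data.rm.top[, -c(1, 2)], by = list(cluster = k$cluster), FUN = mean)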
k$withinss gives the sum of the squared distances from each data point to its cluster center; lower is better, and a high withinss indicates either outliers in the data or a need for more clusters. k$betweenss gives the sum of the squared distances between cluster centers; ideally we want cluster centers far apart from each other.
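Both quantities are available directly on the fitted object; a minimal sketch for the 5-cluster fit above:
k$withinss             #within sum of squares, one value per cluster
k$tot.withinss         #their sum; lower is better
k$betweenss            #spread between the cluster centers
k$betweenss / k$totss  #proportion of total variance explained by the clustering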
Next, we apply the same approach to the wine dataset, which records 13 chemical measurements for 178 wines of three known types.
wine <- read.csv("wine.csv")
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Wine : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic.acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Acl : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Mg : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid.phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanth : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color.int : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
Using a short function, wssplot, we plot the within-cluster sum of squares against different values of k.
Later, NbClust() will generate a table and a barplot for our use in determining the best number of clusters for this dataset.
wssplot <- function(data, nc = 15, seed = 1234) {
  #For k = 1, the within sum of squares equals the total sum of squares
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
  for (i in 2:nc) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(1:nc, wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
}
We scale the data before applying machine-learning techniques; k-means relies on Euclidean distance, so variables on larger scales (such as Proline) would otherwise dominate.
#Scale the wine data, excluding the Wine type label in column 1
df <- scale(wine[-1])
#Examine the data frame and plot the within sum of squares
head(df)
## Alcohol Malic.acid Ash Acl Mg Phenols
## [1,] 1.5143408 -0.56066822 0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252 0.02117152 1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508 0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684 0.22705328 1.8352256 0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132 0.3043010 -1.2860793 0.85828399 1.5576991
## Flavanoids Nonflavanoid.phenols Proanth Color.int Hue
## [1,] 1.0319081 -0.6577078 1.2214385 0.2510088 0.3611585
## [2,] 0.7315653 -0.8184106 -0.5431887 -0.2924962 0.4049085
## [3,] 1.2121137 -0.4970050 2.1299594 0.2682629 0.3174085
## [4,] 1.4623994 -0.9791134 1.0292513 1.1827317 -0.4263410
## [5,] 0.6614853 0.2261576 0.4002753 -0.3183774 0.3611585
## [6,] 1.3622851 -0.1755994 0.6623487 0.7298108 0.4049085
## OD Proline
## [1,] 1.8427215 1.01015939
## [2,] 1.1103172 0.96252635
## [3,] 0.7863692 1.39122370
## [4,] 1.1807407 2.32800680
## [5,] 0.4483365 -0.03776747
## [6,] 0.3356589 2.23274072
wssplot(df)
We use the NbClust() function to help choose k for the k-means analysis; it computes many clustering criteria at once and reports the value of k that most of them favor.
library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 15 proposed 3 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
print(table(nc$Best.n[1,]))
##
## 0 1 2 3 10 12 14 15
## 2 1 4 15 1 1 1 1
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")
Now we use the best value of k to fit the model with the kmeans() command, and print the cross-tabulation against the true wine types along with the accuracy.
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
(df_km <- table(wine$Wine, fit.km$cluster))
##
## 1 2 3
## 1 59 0 0
## 2 3 65 3
## 3 0 0 48
(Accuracy <- (sum(diag(df_km))/sum(df_km)*100))
## [1] 96.62921
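Because this accuracy calculation relies on the cluster numbering happening to line up with the wine types, a labeling-independent agreement measure is a useful cross-check; a minimal sketch, assuming the flexclust package is installed:
library(flexclust)
#Adjusted Rand index between wine type and cluster: 1 = perfect agreement, ~0 = chance
randIndex(df_km)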
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE,
labels=2, lines=0)
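Note that clusplot projects the 13-dimensional scaled data onto its first two principal components, so the plot axes are combinations of all the variables rather than any two raw features.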
Finally, we train a classifier to classify the wines using the cluster labels we obtained in Part II.
library(rattle)
## Warning: package 'rattle' was built under R version 3.4.4
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.4
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
library(RColorBrewer)
Label the data frame with the cluster labels.
df_rpart <- data.frame(k = fit.km$cluster, df)
We use a decision tree model to learn the cluster labels, first shuffling the rows of the data:
rdf <- df_rpart[sample(1:nrow(df_rpart)), ]
We use 80% of the data for training and 20% for testing.
train <- rdf[1:(as.integer(.8*nrow(rdf))-1), ]
test <- rdf[(as.integer(.8*nrow(rdf))):nrow(rdf), ]
fit <- rpart(k ~ ., data=train, method="class")
fancyRpartPlot(fit)
pred <- predict(fit, test, type="class")
(pred_tbl <- table(pred, test$k))
##
## pred 1 2 3
## 1 11 0 0
## 2 0 14 0
## 3 0 0 12
(Accuracy <- (sum(diag(pred_tbl))/sum(pred_tbl)*100))
## [1] 100
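With a perfect score on a small test set, it is worth confirming the tree is not overgrown; a minimal sketch using standard rpart tools on the fit above:
#Cross-validated error at each split
printcp(fit)
#Prune back to the cp with the lowest cross-validated error, if needed
fit.pruned <- prune(fit, cp = fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"])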