Read the data into a data frame
data <- read.csv("Wholesale customers data.csv", header = TRUE)
top.n.custs <- function(data, cols, n = 5) {
  # Initialize a vector to hold the customers being removed
  idx.to.remove <- integer(0)
  for (c in cols) {
    # For every column passed to this function, sort column "c" in
    # descending order (biggest on top). order() returns the sorted
    # indexes (e.g. row 15, 3, 7, 1, ...) rather than the sorted values.
    col.order <- order(data[, c], decreasing = TRUE)
    # Take the first n entries of the sorted column, then combine and
    # de-duplicate them with the row ids already flagged for removal
    idx <- head(col.order, n)
    idx.to.remove <- union(idx.to.remove, idx)
  }
  # Return the indexes of the customers to be removed
  return(idx.to.remove)
}
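For reference, the same row ids can be collected without the explicit accumulator; this is an equivalent sketch (my rewrite, not part of the original lab) of the function above:
# Gather the top-n row ids per column, then de-duplicate, preserving order
top.n.custs.alt <- function(data, cols, n = 5) {
  unique(unlist(lapply(cols, function(c) {
    head(order(data[, c], decreasing = TRUE), n)
  })))
}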
How Many Customers Will Be Removed?
top.custs <- top.n.custs(data, cols = 1:5, n = 5)
length(top.custs)
## [1] 18
Examine the customers
data[top.custs,]
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
## 4 1 3 13265 1196 4221 6404 507 1788
## 182 1 3 112151 29627 18148 16745 4948 8550
## 126 1 3 76237 3473 7102 16538 778 918
## 285 1 3 68951 4411 12609 8692 751 2406
## 40 1 3 56159 555 902 10002 212 2916
## 259 1 1 56083 4563 2124 6422 730 3321
## 87 2 3 22925 73498 32114 987 20070 903
## 48 2 3 44466 54259 55571 7782 24171 6465
## 86 2 3 16117 46197 92780 1026 40827 2944
## 184 1 3 36847 43950 20170 36534 239 47943
## 62 2 3 35942 38369 59598 3254 26701 2017
## 334 2 2 8565 4980 67298 131 38102 1215
## 66 2 3 85 20959 45828 36 24231 1423
Remove the Customers
data.rm.top <- data[-top.custs, ]
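As a quick sanity check (my addition, not in the original lab), the row count should drop by exactly the number of flagged customers:
stopifnot(nrow(data) - nrow(data.rm.top) == length(top.custs))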
Examine summary stats for the remaining data
print(summary(data.rm.top))
## Channel Region Fresh Milk
## Min. :1.00 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.00 1st Qu.:2.000 1st Qu.: 3072 1st Qu.: 1497
## Median :1.00 Median :3.000 Median : 8130 Median : 3582
## Mean :1.31 Mean :2.531 Mean :11076 Mean : 5172
## 3rd Qu.:2.00 3rd Qu.:3.000 3rd Qu.:16251 3rd Qu.: 6962
## Max. :2.00 Max. :3.000 Max. :56082 Max. :36423
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2132 1st Qu.: 738.8 1st Qu.: 255.2 1st Qu.: 398.0
## Median : 4603 Median : 1487.5 Median : 799.5 Median : 904.5
## Mean : 7211 Mean : 2910.3 Mean : 2541.6 Mean : 1352.0
## 3rd Qu.:10391 3rd Qu.: 3428.0 3rd Qu.: 3879.2 3rd Qu.: 1752.2
## Max. :39694 Max. :60869.0 Max. :19410.0 Max. :16523.0
Set the seed for reproducibility
set.seed(12345)
Try K from 2 to 20
rng <- 2:20
Number of times to run the K Means algorithm
tries <- 100
Set up empty vectors to hold the averaged results
avg.totw.ss <- integer(length(rng))
avg.totb.ss <- integer(length(rng))
avg.tot.ss <- integer(length(rng))
For each value of the range variable
for (v in rng) {
  # Set up empty vectors to hold the individual tries
  v.totw.ss <- integer(tries)
  b.totb.ss <- integer(tries)
  tot.ss <- integer(tries)
  # Run kmeans "tries" times for this value of K
  for (i in 1:tries) {
    k.temp <- kmeans(data.rm.top, centers = v)
    # Store the total within-cluster sum of squares
    v.totw.ss[i] <- k.temp$tot.withinss
    # Store the between-cluster sum of squares
    b.totb.ss[i] <- k.temp$betweenss
    # Store the total sum of squares
    tot.ss[i] <- k.temp$totss
  }
  # Average the results; v - 1 maps K = 2:20 onto indexes 1:19
  avg.totw.ss[v - 1] <- mean(v.totw.ss)
  avg.totb.ss[v - 1] <- mean(b.totb.ss)
  avg.tot.ss[v - 1] <- mean(tot.ss)
}
plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares",
xlab="Value of K")
plot(rng,avg.totb.ss,type="b",
main="Total between SS by Various K",
ylab="Average Total Between Sum of Squares",
xlab="Value of K")
Plot the ratios of between-SS / total-SS and within-SS / total-SS for evaluation
plot(rng,avg.totb.ss/avg.tot.ss,type="b",
main="Ratio of between ss / the total ss by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.85, col="red")
plot(rng,avg.totw.ss/avg.tot.ss,type="b",
main="Ratio of within ss / the total ss by Various K",
ylab="Ratio Between SS / Total SS",
xlab="Value of K")
abline(h=0.15, col="red")
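One way to make the 0.85 cutoff explicit (a small sketch I'm adding, not part of the original script) is to take the smallest K whose averaged between-SS ratio crosses the reference line:
ratio <- avg.totb.ss / avg.tot.ss
best.k <- rng[which(ratio >= 0.85)[1]]  # NA if the ratio never reaches 0.85
best.k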
Fit k-means with the chosen number of clusters, removing columns 1 and 2 (Channel and Region)
#n <- readline(prompt = "Enter the best number of clusters: ")
#n <- as.integer(n)
Display cluster centers
k <- kmeans(data.rm.top[, -c(1, 2)], centers = 5)
print(k$centers)
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 5847.637 2379.458 2887.440 2971.375 675.7440 864.8452
## 2 37275.906 5219.844 5850.094 6883.125 824.8125 2169.3438
## 3 4130.566 7624.663 11763.687 1268.361 5135.1205 1301.4217
## 4 6420.711 16893.947 23928.632 2162.421 10290.9737 2438.7368
## 5 18932.426 3374.020 4801.911 3180.574 1142.0198 1536.0099
Give a count of data points in each cluster
print(table(k$cluster))
##
## 1 2 3 4 5
## 168 32 83 38 101
Generate a plot of the clusters
library(cluster)
clusplot(data.rm.top, k$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE,
labels=2, lines=0)
## Summary: In the wholesale dataset, the plot shows that the within-SS curve levels off after K = 5, which is why we took the k value as 5.
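That eyeballed elbow can also be located numerically; a rough sketch (my addition) picks the K at the sharpest bend of the averaged within-SS curve:
# Second differences of the within-SS curve; the largest value marks the
# sharpest bend. d2[i] is centered on rng[i + 1].
d2 <- diff(diff(avg.totw.ss))
rng[which.max(d2) + 1]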
Apply the same training/testing strategy used in Lab 2 to the Online News Popularity dataset
news <- read.csv("OnlineNewsPopularity.csv", header = TRUE)
sum(is.na(news))
## [1] 0
news <- news[, 29:61]
library(caTools)
library(class)
set.seed(12345)
news$popular <- ifelse(news$avg_positive_polarity > 0.5, 1, 0)
# Drop the avg_positive_polarity column the label was derived from (column 23
# of the subset) so it cannot trivially leak into the features
news <- news[, -23]
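One quick check worth running at this point (my addition, not in the original script): confirm the source column is really gone, since keeping it would let the engineered label leak into the features.
"avg_positive_polarity" %in% names(news)  # expect FALSE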
split <- sample.split(news$popular, SplitRatio = 0.8)
training_set <- subset(news, split == TRUE)
test_set <- subset(news, split == FALSE)
train_label <- training_set$popular
test_label <- test_set$popular
#knn <- knn(train = training_set, test = test_set, cl = train_label, k = 13)
# Note: the engineered popular column is still part of the feature matrices
# here; dropping it from training_set and test_set would avoid label leakage.
knn <- knn(train = training_set, test = test_set, cl = train_label, k = 21)
knn <- table(x = test_label, y = knn)
knn
## y
## x 0 1
## 0 7529 0
## 1 399 0
It is said that selecting an odd number for k is a good starting point in the search for the optimal value. Here I selected 21 as the k value because there are 21 variables, and the other values of k I tried did not give a better solution.
This R script will perform a k-Means analysis on the wine dataset from UCI. This dataset has 13 chemical measurements on 178 observations of Italian wine.
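To make that search systematic, here is a minimal sketch (my addition, reusing the objects defined above) that scores several odd k values by test-set accuracy:
# Score a few odd k values by accuracy on the held-out split
for (k.try in c(3, 7, 13, 21, 31)) {
  pred <- class::knn(train = training_set, test = test_set,
                     cl = train_label, k = k.try)
  cat("k =", k.try, " accuracy =", mean(pred == test_label), "\n")
}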
wssplot <- function(data, nc = 15, seed = 1234) {
  # For k = 1, the within-groups SS equals the total SS around the grand
  # mean: (n - 1) times the sum of the column variances
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
  for (i in 2:nc) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(1:nc, wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
}
library(rattle.data)
data(wine)
head(wine)
## Type Alcohol Malic Ash Alcalinity Magnesium Phenols Flavanoids
## 1 1 14.23 1.71 2.43 15.6 127 2.80 3.06
## 2 1 13.20 1.78 2.14 11.2 100 2.65 2.76
## 3 1 13.16 2.36 2.67 18.6 101 2.80 3.24
## 4 1 14.37 1.95 2.50 16.8 113 3.85 3.49
## 5 1 13.24 2.59 2.87 21.0 118 2.80 2.69
## 6 1 14.20 1.76 2.45 15.2 112 3.27 3.39
## Nonflavanoids Proanthocyanins Color Hue Dilution Proline
## 1 0.28 2.29 5.64 1.04 3.92 1065
## 2 0.26 1.28 4.38 1.05 3.40 1050
## 3 0.30 2.81 5.68 1.03 3.17 1185
## 4 0.24 2.18 7.80 0.86 3.45 1480
## 5 0.39 1.82 4.32 1.04 2.93 735
## 6 0.34 1.97 6.75 1.05 2.85 1450
df <- scale(wine[-1])
head(df)
## Alcohol Malic Ash Alcalinity Magnesium Phenols
## [1,] 1.5143408 -0.56066822 0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252 0.02117152 1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508 0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684 0.22705328 1.8352256 0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132 0.3043010 -1.2860793 0.85828399 1.5576991
## Flavanoids Nonflavanoids Proanthocyanins Color Hue
## [1,] 1.0319081 -0.6577078 1.2214385 0.2510088 0.3611585
## [2,] 0.7315653 -0.8184106 -0.5431887 -0.2924962 0.4049085
## [3,] 1.2121137 -0.4970050 2.1299594 0.2682629 0.3174085
## [4,] 1.4623994 -0.9791134 1.0292513 1.1827317 -0.4263410
## [5,] 0.6614853 0.2261576 0.4002753 -0.3183774 0.3611585
## [6,] 1.3622851 -0.1755994 0.6623487 0.7298108 0.4049085
## Dilution Proline
## [1,] 1.8427215 1.01015939
## [2,] 1.1103172 0.96252635
## [3,] 0.7863692 1.39122370
## [4,] 1.1807407 2.32800680
## [5,] 0.4483365 -0.03776747
## [6,] 0.3356589 2.23274072
wssplot(df)
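As a side check on the k = 1 seed value used inside wssplot (my addition), the closed-form baseline should match the total sum of squares reported by kmeans with a single center:
all.equal((nrow(df) - 1) * sum(apply(df, 2, var)),
          kmeans(df, centers = 1)$totss)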
Start the k-Means analysis, using NbClust to store its cluster-count recommendations in the variable “nc”
library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 15 proposed 3 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
print(table(nc$Best.n[1,]))
##
## 0 1 2 3 10 12 14 15
## 2 1 4 15 1 1 1 1
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")
Enter the best number of clusters based on the information in the table and barplot
#n <- readline(prompt = "Enter the best number of clusters: ")
#n <- as.integer(n)
Conduct the k-Means analysis using the best number of clusters
set.seed(1234)
fit.km <- kmeans(df, 3, nstart = 25)  # nstart = 25: keep the best of 25 random starts
print(fit.km$size)
## [1] 62 65 51
print(fit.km$centers)
## Alcohol Malic Ash Alcalinity Magnesium Phenols
## 1 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208 0.88274724
## 2 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869 -0.07576891
## 3 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047 -0.97657548
## Flavanoids Nonflavanoids Proanthocyanins Color Hue
## 1 0.97506900 -0.56050853 0.57865427 0.1705823 0.4726504
## 2 0.02075402 -0.03343924 0.05810161 -0.8993770 0.4605046
## 3 -1.21182921 0.72402116 -0.77751312 0.9388902 -1.1615122
## Dilution Proline
## 1 0.7770551 1.1220202
## 2 0.2700025 -0.7517257
## 3 -1.2887761 -0.4059428
print(aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean))
## cluster Alcohol Malic Ash Alcalinity Magnesium Phenols
## 1 1 13.67677 1.997903 2.466290 17.46290 107.96774 2.847581
## 2 2 12.25092 1.897385 2.231231 20.06308 92.73846 2.247692
## 3 3 13.13412 3.307255 2.417647 21.24118 98.66667 1.683922
## Flavanoids Nonflavanoids Proanthocyanins Color Hue Dilution
## 1 3.0032258 0.2920968 1.922097 5.453548 1.0654839 3.163387
## 2 2.0500000 0.3576923 1.624154 2.973077 1.0627077 2.803385
## 3 0.8188235 0.4519608 1.145882 7.234706 0.6919608 1.696667
## Proline
## 1 1100.2258
## 2 510.1692
## 3 619.0588
Use a confusion or truth table to evaluate how well the k-Means analysis performed
ct.km <- table(wine$Type, fit.km$cluster)
print(ct.km)
##
## 1 2 3
## 1 59 0 0
## 2 3 65 3
## 3 0 0 48
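To condense that table into a single agreement score, one option (my addition, assuming the flexclust package is installed) is the adjusted Rand index, which is about 0 for chance-level agreement and 1 for a perfect match:
library(flexclust)
randIndex(ct.km)  # adjusted Rand index between wine$Type and the clusters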
Generate a plot of the clusters
library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE, labels=2, lines=0)
Set up to train a model for classification of wines
library(rpart)
df <- data.frame(k=fit.km$cluster, df)
print(str(df))
## 'data.frame': 178 obs. of 14 variables:
## $ k : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 1.514 0.246 0.196 1.687 0.295 ...
## $ Malic : num -0.5607 -0.498 0.0212 -0.3458 0.2271 ...
## $ Ash : num 0.231 -0.826 1.106 0.487 1.835 ...
## $ Alcalinity : num -1.166 -2.484 -0.268 -0.807 0.451 ...
## $ Magnesium : num 1.9085 0.0181 0.0881 0.9283 1.2784 ...
## $ Phenols : num 0.807 0.567 0.807 2.484 0.807 ...
## $ Flavanoids : num 1.032 0.732 1.212 1.462 0.661 ...
## $ Nonflavanoids : num -0.658 -0.818 -0.497 -0.979 0.226 ...
## $ Proanthocyanins: num 1.221 -0.543 2.13 1.029 0.4 ...
## $ Color : num 0.251 -0.292 0.268 1.183 -0.318 ...
## $ Hue : num 0.361 0.405 0.317 -0.426 0.361 ...
## $ Dilution : num 1.843 1.11 0.786 1.181 0.448 ...
## $ Proline : num 1.0102 0.9625 1.3912 2.328 -0.0378 ...
## NULL
#Randomize the dataset
rdf <- df[sample(1:nrow(df)), ]
print(head(rdf))
## k Alcohol Malic Ash Alcalinity Magnesium Phenols
## 93 2 -0.3826162 -0.7217931 -0.38826018 0.3608424 -1.3822227 -1.462188745
## 69 2 0.4180475 -1.2499245 -0.02375431 -0.7470867 0.7182523 0.375309174
## 13 1 0.9230815 -0.5427655 0.15849862 -1.0465271 -0.7520802 0.487156874
## 57 1 1.5020229 -0.5696196 -0.24245783 -0.9566950 1.2783790 1.445851440
## 117 2 -1.4542737 -0.7755014 -1.37242601 0.3907864 -0.9621277 -0.503494178
## 161 3 -0.7891070 1.3370245 0.04914686 0.4506745 -0.8220960 0.007809591
## Flavanoids Nonflavanoids Proanthocyanins Color Hue
## 93 -0.5699201 1.7528342 0.05084419 -0.8661960 0.01115870
## 69 -0.7301029 1.5117800 -2.04574255 -0.8144336 0.27365854
## 13 0.7315653 -0.5773564 0.38280376 0.2337547 0.84240820
## 57 0.9718395 -0.8184106 0.76717799 0.5702101 -0.07634125
## 117 -0.4297602 -0.4970050 -0.10639981 -1.3406845 -0.03259127
## 161 -1.1105371 1.1100230 -0.96250606 1.1180287 -1.73884025
## Dilution Proline
## 93 -0.7770322 -0.799896093
## 69 -0.9601332 0.009865569
## 13 0.4060824 1.819921051
## 57 0.9835550 0.708483475
## 117 1.0117244 -0.799896093
## 161 -1.4530976 -0.720507695
train <- rdf[1:(as.integer(0.8 * nrow(rdf)) - 1), ]
test <- rdf[as.integer(0.8 * nrow(rdf)):nrow(rdf), ]
Train the classifier and plot the results
fit <- rpart(k ~ ., data=train, method="class")
#library(rpart.plot)
#library(RColorBrewer)
#library(rattle)
#fancyRpartPlot(fit)
Now use the predict() function to see how well the model works
predict(fit, test, type="class")
## 133 148 120 29 14 108 75 11 4 77 160 91 176 118 105 71 3 54
## 3 3 2 1 1 2 1 1 1 2 3 2 3 2 2 3 1 1
## 30 169 165 156 83 166 116 65 112 114 87 140 149 47 132 45 138 50
## 1 3 3 3 2 3 2 2 2 2 2 2 3 1 3 1 3 1
## 1
## 1
## Levels: 1 2 3
#print(table(predict(fit, test, type="class"), test$k))
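A compact way to finish the evaluation (a sketch I'm adding): tabulate predictions against the held-out cluster labels and compute the accuracy.
pred <- predict(fit, test, type = "class")
print(table(predicted = pred, actual = test$k))
mean(pred == test$k)  # proportion of test rows classified correctly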
The reason I chose k as 3 is that the barplot clearly shows most of the NbClust criteria voting for three clusters, and the behavior of the between-SS relative to the within-SS points to the same choice.