PART 1

Read the data into a data frame

data <- read.csv("Wholesale customers data.csv", header = TRUE)


top.n.custs <- function(data, cols, n = 5) {
  # Initialize a vector to hold the row indexes of customers being removed
  idx.to.remove <- integer(0)

  for (c in cols) {
    # For every column in the data passed to this function,
    # sort column "c" in descending order (bigger on top).
    # order() returns the sorted indexes (e.g. row 15, 3, 7, 1, ...)
    # rather than the sorted values themselves.
    col.order <- order(data[, c], decreasing = TRUE)

    # Take the first n indexes of the sorted column, then
    # combine and de-duplicate the row ids that need to be removed
    idx <- head(col.order, n)
    idx.to.remove <- union(idx.to.remove, idx)
  }

  # Return the indexes of customers to be removed
  return(idx.to.remove)
}

How Many Customers Will Be Removed?

# Note: cols = 1:5 includes the categorical Channel and Region columns;
# kept as-is to match the recorded output below
top.custs <- top.n.custs(data, cols = 1:5, n = 5)
length(top.custs)
## [1] 18

Examine the customers (only 18 of a possible 25 rows, since union() drops customers who rank in the top 5 of more than one column)

data[top.custs,]
##     Channel Region  Fresh  Milk Grocery Frozen Detergents_Paper Delicassen
## 1         2      3  12669  9656    7561    214             2674       1338
## 2         2      3   7057  9810    9568   1762             3293       1776
## 3         2      3   6353  8808    7684   2405             3516       7844
## 5         2      3  22615  5410    7198   3915             1777       5185
## 6         2      3   9413  8259    5126    666             1795       1451
## 4         1      3  13265  1196    4221   6404              507       1788
## 182       1      3 112151 29627   18148  16745             4948       8550
## 126       1      3  76237  3473    7102  16538              778        918
## 285       1      3  68951  4411   12609   8692              751       2406
## 40        1      3  56159   555     902  10002              212       2916
## 259       1      1  56083  4563    2124   6422              730       3321
## 87        2      3  22925 73498   32114    987            20070        903
## 48        2      3  44466 54259   55571   7782            24171       6465
## 86        2      3  16117 46197   92780   1026            40827       2944
## 184       1      3  36847 43950   20170  36534              239      47943
## 62        2      3  35942 38369   59598   3254            26701       2017
## 334       2      2   8565  4980   67298    131            38102       1215
## 66        2      3     85 20959   45828     36            24231       1423

Remove the Customers

data.rm.top <- data[-top.custs, ]

Examine summary stats for the remaining data

print(summary(data.rm.top))
##     Channel         Region          Fresh            Milk      
##  Min.   :1.00   Min.   :1.000   Min.   :    3   Min.   :   55  
##  1st Qu.:1.00   1st Qu.:2.000   1st Qu.: 3072   1st Qu.: 1497  
##  Median :1.00   Median :3.000   Median : 8130   Median : 3582  
##  Mean   :1.31   Mean   :2.531   Mean   :11076   Mean   : 5172  
##  3rd Qu.:2.00   3rd Qu.:3.000   3rd Qu.:16251   3rd Qu.: 6962  
##  Max.   :2.00   Max.   :3.000   Max.   :56082   Max.   :36423  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2132   1st Qu.:  738.8   1st Qu.:  255.2   1st Qu.:  398.0  
##  Median : 4603   Median : 1487.5   Median :  799.5   Median :  904.5  
##  Mean   : 7211   Mean   : 2910.3   Mean   : 2541.6   Mean   : 1352.0  
##  3rd Qu.:10391   3rd Qu.: 3428.0   3rd Qu.: 3879.2   3rd Qu.: 1752.2  
##  Max.   :39694   Max.   :60869.0   Max.   :19410.0   Max.   :16523.0

Set the seed for reproducibility

set.seed(12345)

Try K from 2 to 20

rng <- 2:20

Number of times to run the K Means algorithm

tries <- 100

Set up empty vectors to hold the averaged results

avg.totw.ss <- integer(length(rng))
avg.totb.ss <- integer(length(rng))
avg.tot.ss <- integer(length(rng))

For each value of the range variable

for (v in rng) {
  # Set up empty vectors to hold the results of each try
  v.totw.ss <- integer(tries)
  v.totb.ss <- integer(tries)
  v.tot.ss  <- integer(tries)

  # Run kmeans "tries" times for this value of K
  for (i in 1:tries) {
    k.temp <- kmeans(data.rm.top, centers = v)

    # Store the total within-cluster sum of squares
    v.totw.ss[i] <- k.temp$tot.withinss

    # Store the between-cluster sum of squares
    v.totb.ss[i] <- k.temp$betweenss

    # Store the total sum of squares
    v.tot.ss[i] <- k.temp$totss
  }

  # Average the withinss, betweenss, and totss over the tries
  # (index v-1 because rng starts at 2)
  avg.totw.ss[v-1] <- mean(v.totw.ss)
  avg.totb.ss[v-1] <- mean(v.totb.ss)
  avg.tot.ss[v-1]  <- mean(v.tot.ss)
}

plot(rng,avg.totw.ss,type="b", main="Total Within SS by Various K", 
  ylab="Average Total Within Sum of Squares",
  xlab="Value of K")

plot(rng, avg.totb.ss, type="b",
  main="Total Between SS by Various K",
  ylab="Average Total Between Sum of Squares",
  xlab="Value of K")

Plot the ratios of betweenss to total SS and withinss to total SS for evaluation

plot(rng, avg.totb.ss/avg.tot.ss, type="b",
  main="Ratio of Between SS to Total SS by Various K",
  ylab="Ratio Between SS / Total SS",
  xlab="Value of K")
abline(h=0.85, col="red")

plot(rng, avg.totw.ss/avg.tot.ss, type="b",
  main="Ratio of Within SS to Total SS by Various K",
  ylab="Ratio Within SS / Total SS",
  xlab="Value of K")
abline(h=0.15, col="red")

Choose the best number of clusters (5, from the plots above) and remove columns 1 and 2 (Channel and Region)

#n <- readline(prompt = "Enter the best number of clusters: ")
#n <- as.integer(n)

Display cluster centers

k <- kmeans(data.rm.top[, -c(1,2)], centers = 5)
print(k$centers)
##       Fresh      Milk   Grocery   Frozen Detergents_Paper Delicassen
## 1  5847.637  2379.458  2887.440 2971.375         675.7440   864.8452
## 2 37275.906  5219.844  5850.094 6883.125         824.8125  2169.3438
## 3  4130.566  7624.663 11763.687 1268.361        5135.1205  1301.4217
## 4  6420.711 16893.947 23928.632 2162.421       10290.9737  2438.7368
## 5 18932.426  3374.020  4801.911 3180.574        1142.0198  1536.0099

Give a count of data points in each cluster

print(table(k$cluster))
## 
##   1   2   3   4   5 
## 168  32  83  38 101

Generate a plot of the clusters

library(cluster)
clusplot(data.rm.top, k$cluster, main='2D representation of the Cluster solution',
  color=TRUE, shade=TRUE, 
  labels=2, lines=0)

Summary:

In the wholesale dataset, the plots show that the curves level off after K = 5, which is why we took 5 as the value of K.
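
A minimal sketch (not part of the original script): rather than eyeballing the elbow, pick the smallest K whose average between-SS / total-SS ratio first reaches the 0.85 cutoff drawn on the ratio plot above.

ratio <- avg.totb.ss / avg.tot.ss
best.k <- rng[which(ratio >= 0.85)[1]]
best.k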

Now apply the same strategy used in Lab 2 for training and testing data to the Online News Popularity dataset

news <- read.csv("OnlineNewsPopularity.csv", header = T)
sum(is.na(news))
## [1] 0
news <- news[, 29:61]

library(caTools)
library(class)
set.seed(12345)
# Label an article "popular" when avg_positive_polarity exceeds 0.5, then drop
# that column (column 23 of this subset) so the label's source is not a predictor
news$popular <- ifelse(news$avg_positive_polarity > 0.5, 1, 0)
news <- news[, -23]

# Stratified 80/20 train/test split on the new label
split <- sample.split(news$popular, SplitRatio = 0.8)
training_set <- subset(news, split == TRUE)
test_set <- subset(news, split == FALSE)

train_label <- training_set$popular
test_label <- test_set$popular
# Note: the popular column itself is still among the kNN predictors here
#knn.pred <- knn(train = training_set, test = test_set, cl = train_label, k = 13)
knn.pred <- knn(train = training_set, test = test_set, cl = train_label, k = 21)
# Confusion table of true labels (rows) against predictions (columns)
knn.table <- table(x = test_label, y = knn.pred)
knn.table
##    y
## x      0    1
##   0 7529    0
##   1  399    0

Summary:

It is said that selecting an odd number for k is a good starting point in the search for the optimal value. Here I selected k = 21 because there are 21 variables, and the other values of k that I tried did not give a better solution. A quick way to compare candidate values of k is sketched below.
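
A minimal sketch (not run in the original): loop over several odd values of k and compare test-set accuracy. Variable names follow the split above.

for (k.try in seq(3, 25, by = 2)) {
  pred <- knn(train = training_set, test = test_set, cl = train_label, k = k.try)
  cat("k =", k.try, "accuracy =", round(mean(pred == test_label), 4), "\n")
}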

This R script will also perform a k-Means analysis, this time on the wine dataset from UCI. This dataset has 13 chemical measurements on 178 observations of Italian wine.

wssplot <- function(data, nc = 15, seed = 1234) {
  # Within-groups SS for k = 1 is (n - 1) times the sum of the column variances
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
  for (i in 2:nc) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(1:nc, wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
}

library(rattle.data)
data(wine)
head(wine)
##   Type Alcohol Malic  Ash Alcalinity Magnesium Phenols Flavanoids
## 1    1   14.23  1.71 2.43       15.6       127    2.80       3.06
## 2    1   13.20  1.78 2.14       11.2       100    2.65       2.76
## 3    1   13.16  2.36 2.67       18.6       101    2.80       3.24
## 4    1   14.37  1.95 2.50       16.8       113    3.85       3.49
## 5    1   13.24  2.59 2.87       21.0       118    2.80       2.69
## 6    1   14.20  1.76 2.45       15.2       112    3.27       3.39
##   Nonflavanoids Proanthocyanins Color  Hue Dilution Proline
## 1          0.28            2.29  5.64 1.04     3.92    1065
## 2          0.26            1.28  4.38 1.05     3.40    1050
## 3          0.30            2.81  5.68 1.03     3.17    1185
## 4          0.24            2.18  7.80 0.86     3.45    1480
## 5          0.39            1.82  4.32 1.04     2.93     735
## 6          0.34            1.97  6.75 1.05     2.85    1450
# Standardize the 13 measurements (column 1 is the Type label)
df <- scale(wine[-1])
head(df) 
##        Alcohol       Malic        Ash Alcalinity  Magnesium   Phenols
## [1,] 1.5143408 -0.56066822  0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252  0.02117152  1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508  0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684  0.22705328  1.8352256  0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132  0.3043010 -1.2860793 0.85828399 1.5576991
##      Flavanoids Nonflavanoids Proanthocyanins      Color        Hue
## [1,]  1.0319081    -0.6577078       1.2214385  0.2510088  0.3611585
## [2,]  0.7315653    -0.8184106      -0.5431887 -0.2924962  0.4049085
## [3,]  1.2121137    -0.4970050       2.1299594  0.2682629  0.3174085
## [4,]  1.4623994    -0.9791134       1.0292513  1.1827317 -0.4263410
## [5,]  0.6614853     0.2261576       0.4002753 -0.3183774  0.3611585
## [6,]  1.3622851    -0.1755994       0.6623487  0.7298108  0.4049085
##       Dilution     Proline
## [1,] 1.8427215  1.01015939
## [2,] 1.1103172  0.96252635
## [3,] 0.7863692  1.39122370
## [4,] 1.1807407  2.32800680
## [5,] 0.4483365 -0.03776747
## [6,] 0.3356589  2.23274072
wssplot(df)

Start the k-Means analysis using the variable “nc” for the number of clusters

library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 15 proposed 3 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
print(table(nc$Best.n[1,]))
## 
##  0  1  2  3 10 12 14 15 
##  2  1  4 15  1  1  1  1
barplot(table(nc$Best.n[1,]), xlab = "Number of Clusters", ylab = "Number of Criteria", main = "Number of Clusters Chosen by 26 Criteria")

Enter the best number of clusters based on the information in the table and barplot

#n <- readline(prompt = "Enter the best number of clusters: ") 
#n <- as.integer(n)
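
A hedged alternative to the interactive prompt above (assuming the nc object from NbClust): take the majority vote of the criteria directly.

best.n <- as.integer(names(which.max(table(nc$Best.n[1, ]))))
best.n   # 3 for this run, per the table and barplot above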

Conduct the k-Means analysis using the best number of clusters

set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25)
print(fit.km$size)
## [1] 62 65 51
print(fit.km$centers)
##      Alcohol      Malic        Ash Alcalinity   Magnesium     Phenols
## 1  0.8328826 -0.3029551  0.3636801 -0.6084749  0.57596208  0.88274724
## 2 -0.9234669 -0.3929331 -0.4931257  0.1701220 -0.49032869 -0.07576891
## 3  0.1644436  0.8690954  0.1863726  0.5228924 -0.07526047 -0.97657548
##    Flavanoids Nonflavanoids Proanthocyanins      Color        Hue
## 1  0.97506900   -0.56050853      0.57865427  0.1705823  0.4726504
## 2  0.02075402   -0.03343924      0.05810161 -0.8993770  0.4605046
## 3 -1.21182921    0.72402116     -0.77751312  0.9388902 -1.1615122
##     Dilution    Proline
## 1  0.7770551  1.1220202
## 2  0.2700025 -0.7517257
## 3 -1.2887761 -0.4059428
print(aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean))
##   cluster  Alcohol    Malic      Ash Alcalinity Magnesium  Phenols
## 1       1 13.67677 1.997903 2.466290   17.46290 107.96774 2.847581
## 2       2 12.25092 1.897385 2.231231   20.06308  92.73846 2.247692
## 3       3 13.13412 3.307255 2.417647   21.24118  98.66667 1.683922
##   Flavanoids Nonflavanoids Proanthocyanins    Color       Hue Dilution
## 1  3.0032258     0.2920968        1.922097 5.453548 1.0654839 3.163387
## 2  2.0500000     0.3576923        1.624154 2.973077 1.0627077 2.803385
## 3  0.8188235     0.4519608        1.145882 7.234706 0.6919608 1.696667
##     Proline
## 1 1100.2258
## 2  510.1692
## 3  619.0588

Use a confusion or truth table to evaluate how well the k-Means analysis performed

ct.km <- table(wine$Type, fit.km$cluster) 
print(ct.km)
##    
##      1  2  3
##   1 59  0  0
##   2  3 65  3
##   3  0  0 48
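
One common follow-up (assuming the flexclust package is installed; it is not loaded in the original script) is to quantify this agreement with the adjusted Rand index, which ranges from -1 (no agreement) to 1 (perfect agreement).

library(flexclust)
randIndex(ct.km)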

Generate a plot of the clusters

library(cluster)
clusplot(df, fit.km$cluster, main='2D representation of the Cluster solution',
  color=TRUE, shade=TRUE, labels=2, lines=0)

Set up to train a model for classification of wines

library(rpart)
df <- data.frame(k=fit.km$cluster, df) 
str(df)
## 'data.frame':    178 obs. of  14 variables:
##  $ k              : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol        : num  1.514 0.246 0.196 1.687 0.295 ...
##  $ Malic          : num  -0.5607 -0.498 0.0212 -0.3458 0.2271 ...
##  $ Ash            : num  0.231 -0.826 1.106 0.487 1.835 ...
##  $ Alcalinity     : num  -1.166 -2.484 -0.268 -0.807 0.451 ...
##  $ Magnesium      : num  1.9085 0.0181 0.0881 0.9283 1.2784 ...
##  $ Phenols        : num  0.807 0.567 0.807 2.484 0.807 ...
##  $ Flavanoids     : num  1.032 0.732 1.212 1.462 0.661 ...
##  $ Nonflavanoids  : num  -0.658 -0.818 -0.497 -0.979 0.226 ...
##  $ Proanthocyanins: num  1.221 -0.543 2.13 1.029 0.4 ...
##  $ Color          : num  0.251 -0.292 0.268 1.183 -0.318 ...
##  $ Hue            : num  0.361 0.405 0.317 -0.426 0.361 ...
##  $ Dilution       : num  1.843 1.11 0.786 1.181 0.448 ...
##  $ Proline        : num  1.0102 0.9625 1.3912 2.328 -0.0378 ...
#Randomize the dataset
rdf <- df[sample(1:nrow(df)), ] 
print(head(rdf))
##     k    Alcohol      Malic         Ash Alcalinity  Magnesium      Phenols
## 93  2 -0.3826162 -0.7217931 -0.38826018  0.3608424 -1.3822227 -1.462188745
## 69  2  0.4180475 -1.2499245 -0.02375431 -0.7470867  0.7182523  0.375309174
## 13  1  0.9230815 -0.5427655  0.15849862 -1.0465271 -0.7520802  0.487156874
## 57  1  1.5020229 -0.5696196 -0.24245783 -0.9566950  1.2783790  1.445851440
## 117 2 -1.4542737 -0.7755014 -1.37242601  0.3907864 -0.9621277 -0.503494178
## 161 3 -0.7891070  1.3370245  0.04914686  0.4506745 -0.8220960  0.007809591
##     Flavanoids Nonflavanoids Proanthocyanins      Color         Hue
## 93  -0.5699201     1.7528342      0.05084419 -0.8661960  0.01115870
## 69  -0.7301029     1.5117800     -2.04574255 -0.8144336  0.27365854
## 13   0.7315653    -0.5773564      0.38280376  0.2337547  0.84240820
## 57   0.9718395    -0.8184106      0.76717799  0.5702101 -0.07634125
## 117 -0.4297602    -0.4970050     -0.10639981 -1.3406845 -0.03259127
## 161 -1.1105371     1.1100230     -0.96250606  1.1180287 -1.73884025
##       Dilution      Proline
## 93  -0.7770322 -0.799896093
## 69  -0.9601332  0.009865569
## 13   0.4060824  1.819921051
## 57   0.9835550  0.708483475
## 117  1.0117244 -0.799896093
## 161 -1.4530976 -0.720507695
# 80/20 split of the shuffled rows into training and test sets
train <- rdf[1:(as.integer(.8 * nrow(rdf)) - 1), ]
test <- rdf[(as.integer(.8 * nrow(rdf))):nrow(rdf), ]

Train the classifier and plot the results

# Train a classification tree to predict the k-means cluster from the 13 measurements
fit <- rpart(k ~ ., data=train, method="class")
#library(rpart.plot)
#library(RColorBrewer)
#library(rattle)   # fancyRpartPlot() comes from the rattle package
#fancyRpartPlot(fit)

Now use the predict() function to see how well the model works

predict(fit, test, type="class")
## 133 148 120  29  14 108  75  11   4  77 160  91 176 118 105  71   3  54 
##   3   3   2   1   1   2   1   1   1   2   3   2   3   2   2   3   1   1 
##  30 169 165 156  83 166 116  65 112 114  87 140 149  47 132  45 138  50 
##   1   3   3   3   2   3   2   2   2   2   2   2   3   1   3   1   3   1 
##   1 
##   1 
## Levels: 1 2 3
#print(table(predict(fit, test, type="class"), test$k))
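
A minimal sketch (no output recorded in the original): tabulate the tree's predictions against the k-means labels and compute the share that agree.

pred <- predict(fit, test, type="class")
print(table(pred, test$k))
# Predictions are a factor with levels "1","2","3"; convert before comparing
print(mean(as.integer(as.character(pred)) == test$k))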

Summary:

I chose k = 3 because the barplot shows that a clear majority of the NbClust criteria (15 of the 26) propose 3 clusters, and the within-groups sum-of-squares plot from wssplot() bends at 3 before flattening out.