1 Load Data

library(data.table)
library(dplyr)
library(ggplot2)
library(caret)
library(e1071)
#library(xgboost)
#library(corplot)
setwd("./")
train = fread("Train_UWu5bXk.csv")
test = fread("Test_u94Q5KV.csv")
#test[,Item_Outlet_Sales :=NA]
combi = rbind(train, test, fill =T)
train = combi[1:nrow(train)]

2 Data Preprocessing

2.1 Data Cleaning

Check NAs

colSums(is.na(combi))
          Item_Identifier               Item_Weight          Item_Fat_Content           Item_Visibility                 Item_Type                  Item_MRP 
                        0                      2439                         0                         0                         0                         0 
        Outlet_Identifier Outlet_Establishment_Year               Outlet_Size      Outlet_Location_Type               Outlet_Type         Item_Outlet_Sales 
                        0                         0                         0                         0                         0                      5681 

There are too many NAs in Item_Outlet_Sales (5681 out of 8523). Therefore, we shall drop this column.

Item_Outlet_Sales has 2439 missing variables, which we shall replace with mean value.

Remove All NA rows

# Data Cleaning
# Item_Weight is removed as it contains too many missing values
combi$Item_Weight <- NULL
#colSums(is.na(combi))
# Repalce with Average
weight.mean = mean(combi$Item_Outlet_Sales, na.rm=TRUE)
combi$Item_Outlet_Sales[is.na(combi$Item_Outlet_Sales)] = weight.mean
# removing any rows that contain NA
#combi = na.omit(combi)
colSums(is.na(combi))
          Item_Identifier          Item_Fat_Content           Item_Visibility                 Item_Type                  Item_MRP         Outlet_Identifier 
                        0                         0                         0                         0                         0                         0 
Outlet_Establishment_Year               Outlet_Size      Outlet_Location_Type               Outlet_Type         Item_Outlet_Sales 
                        0                         0                         0                         0                         0 

2.2 Data Encoding

combi = combi[Item_Fat_Content == "LF", Item_Fat_Content :="Low Fat"]
combi = combi[Item_Fat_Content == "low fat", Item_Fat_Content :="Low Fat"]
combi = combi[Item_Fat_Content == "reg", Item_Fat_Content :="Regular"]
combi$Outlet_Size[combi$Outlet_Size == ""] <- NA
# Encoding Categorial Variables
combi[,Outlet_Size_num := ifelse(Outlet_Size == "Small", 0,
                                 ifelse(Outlet_Size == "Medium", 1, 2))]
combi[,Outlet_Location_Type_num := ifelse(Outlet_Location_Type == "Tier 3", 0,
                                          ifelse(Outlet_Location_Type == "Tier 2", 1, 2))]
# removing categorical variables after label encoding
combi[, c("Outlet_Size", "Outlet_Location_Type") := NULL]
str(combi)
Classes ‘data.table’ and 'data.frame':  14204 obs. of  11 variables:
 $ Item_Identifier          : chr  "FDA15" "DRC01" "FDN15" "FDX07" ...
 $ Item_Fat_Content         : chr  "Low Fat" "Regular" "Low Fat" "Regular" ...
 $ Item_Visibility          : num  0.016 0.0193 0.0168 0 0 ...
 $ Item_Type                : chr  "Dairy" "Soft Drinks" "Meat" "Fruits and Vegetables" ...
 $ Item_MRP                 : num  249.8 48.3 141.6 182.1 53.9 ...
 $ Outlet_Identifier        : chr  "OUT049" "OUT018" "OUT049" "OUT010" ...
 $ Outlet_Establishment_Year: int  1999 2009 1999 1998 1987 2009 1987 1985 2002 2007 ...
 $ Outlet_Type              : chr  "Supermarket Type1" "Supermarket Type2" "Supermarket Type1" "Grocery Store" ...
 $ Item_Outlet_Sales        : num  3735 443 2097 732 995 ...
 $ Outlet_Size_num          : num  1 1 1 NA 2 1 2 1 NA NA ...
 $ Outlet_Location_Type_num : num  2 0 2 0 0 0 0 0 1 1 ...
 - attr(*, ".internal.selfref")=<externalptr> 
 - attr(*, "index")= int 

2.3 Scaling

#Scaling
scaling = scale(combi$Item_Outlet_Sales)
combi[,Item_Outlet_Sales := scaling]
scaling = scale(combi$Item_Visibility)
combi[,Item_Visibility := scaling]
scaling = scale(combi$Item_MRP)
combi[,Item_MRP := scaling]
scaling = scale(combi$Outlet_Establishment_Year)
combi[,Outlet_Establishment_Year:= scaling]
#scaling = scale(combi$Item_Weight)
#combi[,Item_Weight:= scaling]

We see that historgram on Item_MRP has four peaks. This variable potentially can form four groups with single variable.

par(mfrow=c(1,1))
hist(combi$Item_MRP)

Variable Item_Visibility and Outlet_Sales is skewed towards left.

par(mfrow=c(1,2))
hist(combi$Item_Outlet_Sales)
hist(combi$Item_Visibility)

3 Visualize Scatter Plot

# select number features
train.df = combi %>% 
  select( Item_Visibility,Item_MRP, Item_Outlet_Sales, Outlet_Establishment_Year)
plot(train.df)

4 K-Mean Clustering

4.1 Feature Selection

We choose only SINGLE feature, Item_MRP for this clustering.

train.df1 = combi %>% select(Item_MRP)

4.2 Discover K

k <- list()
for(i in 1:10){
  k[[i]] <- kmeans(train.df1, i)
}
betweenss_totss <- list()
for(i in 1:10){
  betweenss_totss[[i]] <- k[[i]]$betweenss/k[[i]]$totss
}
plot(1:10, betweenss_totss, type = "b", 
     ylab = "Between SS / Total SS", xlab = "Clusters (k)")

4.3 Cluster and Plot

We choose k=4, similar to visualization on scatter plot.

# build cluster
set.seed(123)
km.res <- kmeans(train.df1, 4, nstart =25 ) 
# k-means group size
km.cluster=km.res$cluster
table(km.cluster)
km.cluster
   1    2    3    4 
2556 4931 2400 4317 
# visualize
plot(combi[,c('Item_Visibility','Item_MRP', 'Item_Outlet_Sales')], col = km.cluster)

5 H-Clustering

5.1 Feature Selection

We follow the same SINGLE feature like the kmean above.

train.df = combi %>% select(Item_MRP)
d <- dist(train.df)
fitH <- hclust(d, "ward.D2")
plot(fitH) 
rect.hclust(fitH, k = 4, border = "red") 

5.2 Cluster and Plot

hc.clusters <- cutree(fitH, k = 4) 
plot(combi[,c('Item_Visibility','Item_MRP', 'Item_Outlet_Sales')], col = clusters)

table(hc.clusters)
hc.clusters
   1    2    3    4 
2400 2208 4935 4661 

6 Cluster Agreeemnt

We can see that both clustering method quite agrees to each other in this dataset, with only some exception on group 3 and group 4.

agreement.table = table(km.cluster, hc.clusters, dnn=c('kmeans','hcluster'))
agreement.table
      hcluster
kmeans    1    2    3    4
     1    0 2208    0  348
     2    0    0 4931    0
     3 2400    0    0    0
     4    0    0    4 4313

The Rand Index or Rand measure (named after William M. Rand) in statistics, and in particular in data clustering, is a measure of the similarity between two data clusterings. Adjusted Rand Index is correlated chance of version of Rand Index.

*ARI** shows high value of 0.943, which means both Hierechical clustering and kmeans method agrees highly with each other.

randIndex(agreement.table)
      ARI 
0.9429736 
LS0tDQp0aXRsZTogImFzc2lnbm1lbnRfM19zY3JpcHQiDQpvdXRwdXQ6DQogIGh0bWxfbm90ZWJvb2s6DQogICAgbnVtYmVyX3NlY3Rpb25zOiB5ZXMNCiAgICB0aGVtZTogY2VydWxlYW4NCiAgICB0b2M6IHllcw0KICAgIHRvY19kZXB0aDogNA0KICBodG1sX2RvY3VtZW50Og0KICAgIGRmX3ByaW50OiBwYWdlZA0KICAgIHRvYzogeWVzDQogICAgdG9jX2RlcHRoOiAnNCcNCi0tLQ0KDQpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSwgbWVzc2FnZT1GQUxTRSkNCmBgYA0KDQojIExvYWQgRGF0YQ0KDQpgYGB7cn0NCmxpYnJhcnkoZGF0YS50YWJsZSkNCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KGdncGxvdDIpDQpsaWJyYXJ5KGNhcmV0KQ0KbGlicmFyeShlMTA3MSkNCmxpYnJhcnkoZmxleGNsdXN0KQ0KbGlicmFyeShkYnNjYW4pDQojbGlicmFyeSh4Z2Jvb3N0KQ0KI2xpYnJhcnkoY29ycGxvdCkNCg0Kc2V0d2QoIi4vIikNCg0KdHJhaW4gPSBmcmVhZCgiVHJhaW5fVVd1NWJYay5jc3YiKQ0KdGVzdCA9IGZyZWFkKCJUZXN0X3U5NFE1S1YuY3N2IikNCiN0ZXN0WyxJdGVtX091dGxldF9TYWxlcyA6PU5BXQ0KY29tYmkgPSByYmluZCh0cmFpbiwgdGVzdCwgZmlsbCA9VCkNCnRyYWluID0gY29tYmlbMTpucm93KHRyYWluKV0NCmBgYA0KDQojIERhdGEgUHJlcHJvY2Vzc2luZw0KDQojIyBEYXRhIENsZWFuaW5nDQoNCioqQ2hlY2sgTkFzKioNCg0KYGBge3J9DQpjb2xTdW1zKGlzLm5hKGNvbWJpKSkNCmBgYA0KDQpUaGVyZSBhcmUgdG9vIG1hbnkgTkFzIGluIEl0ZW1fT3V0bGV0X1NhbGVzICg1NjgxIG91dCBvZiA4NTIzKS4gVGhlcmVmb3JlLCB3ZSBzaGFsbCBkcm9wIHRoaXMgY29sdW1uLg0KDQpJdGVtX091dGxldF9TYWxlcyBoYXMgMjQzOSBtaXNzaW5nIHZhcmlhYmxlcywgd2hpY2ggd2Ugc2hhbGwgcmVwbGFjZSB3aXRoIG1lYW4gdmFsdWUuDQoNCioqUmVtb3ZlIEFsbCBOQSByb3dzKioNCg0KYGBge3J9DQojIERhdGEgQ2xlYW5pbmcNCiMgSXRlbV9XZWlnaHQgaXMgcmVtb3ZlZCBhcyBpdCBjb250YWlucyB0b28gbWFueSBtaXNzaW5nIHZhbHVlcw0KY29tYmkkSXRlbV9XZWlnaHQgPC0gTlVMTA0KI2NvbFN1bXMoaXMubmEoY29tYmkpKQ0KDQojIFJlcGFsY2Ugd2l0aCBBdmVyYWdlDQp3ZWlnaHQubWVhbiA9IG1lYW4oY29tYmkkSXRlbV9PdXRsZXRfU2FsZXMsIG5hLnJtPVRSVUUpDQpjb21iaSRJdGVtX091dGxldF9TYWxlc1tpcy5uYShjb21iaSRJdGVtX091dGxldF9TYWxlcyldID0gd2VpZ2h0Lm1lYW4NCg0KIyByZW1vdmluZyBhbnkgcm93cyB0aGF0IGNvbnRhaW4gTkENCiNjb21iaSA9IG5hLm9taXQoY29tYmkpDQpjb2xTdW1zKGlzLm5hKGNvbWJpKSkNCg0KYGBgDQoNCiMjIERhdGEgRW5jb2RpbmcNCg0KYGBge3J9DQpjb21iaSA9IGNvbWJpW0l0ZW1fRmF0X0NvbnRlbnQgPT0gIkxGIiwgSXRlbV9GYXRfQ29udGVudCA6PSJMb3cgRmF0Il0NCmNvbWJpID0gY29tYmlbSXRlbV9GYXRfQ29udGVudCA9PSAibG93IGZhdCIsIEl0ZW1fRmF0X0NvbnRlbnQgOj0iTG93IEZhdCJdDQpjb21iaSA9IGNvbWJpW0l0ZW1fRmF0X0NvbnRlbnQgPT0gInJlZyIsIEl0ZW1fRmF0X0NvbnRlbnQgOj0iUmVndWxhciJdDQpjb21iaSRPdXRsZXRfU2l6ZVtjb21iaSRPdXRsZXRfU2l6ZSA9PSAiIl0gPC0gTkENCg0KIyBFbmNvZGluZyBDYXRlZ29yaWFsIFZhcmlhYmxlcw0KY29tYmlbLE91dGxldF9TaXplX251bSA6PSBpZmVsc2UoT3V0bGV0X1NpemUgPT0gIlNtYWxsIiwgMCwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGlmZWxzZShPdXRsZXRfU2l6ZSA9PSAiTWVkaXVtIiwgMSwgMikpXQ0KY29tYmlbLE91dGxldF9Mb2NhdGlvbl9UeXBlX251bSA6PSBpZmVsc2UoT3V0bGV0X0xvY2F0aW9uX1R5cGUgPT0gIlRpZXIgMyIsIDAsDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBpZmVsc2UoT3V0bGV0X0xvY2F0aW9uX1R5cGUgPT0gIlRpZXIgMiIsIDEsIDIpKV0NCg0KIyByZW1vdmluZyBjYXRlZ29yaWNhbCB2YXJpYWJsZXMgYWZ0ZXIgbGFiZWwgZW5jb2RpbmcNCmNvbWJpWywgYygiT3V0bGV0X1NpemUiLCAiT3V0bGV0X0xvY2F0aW9uX1R5cGUiKSA6PSBOVUxMXQ0KDQpzdHIoY29tYmkpDQpgYGANCg0KIyMgU2NhbGluZw0KDQpgYGB7cn0NCiNTY2FsaW5nDQpzY2FsaW5nID0gc2NhbGUoY29tYmkkSXRlbV9PdXRsZXRfU2FsZXMpDQpjb21iaVssSXRlbV9PdXRsZXRfU2FsZXMgOj0gc2NhbGluZ10NCnNjYWxpbmcgPSBzY2FsZShjb21iaSRJdGVtX1Zpc2liaWxpdHkpDQpjb21iaVssSXRlbV9WaXNpYmlsaXR5IDo9IHNjYWxpbmddDQpzY2FsaW5nID0gc2NhbGUoY29tYmkkSXRlbV9NUlApDQpjb21iaVssSXRlbV9NUlAgOj0gc2NhbGluZ10NCnNjYWxpbmcgPSBzY2FsZShjb21iaSRPdXRsZXRfRXN0YWJsaXNobWVudF9ZZWFyKQ0KY29tYmlbLE91dGxldF9Fc3RhYmxpc2htZW50X1llYXI6PSBzY2FsaW5nXQ0KI3NjYWxpbmcgPSBzY2FsZShjb21iaSRJdGVtX1dlaWdodCkNCiNjb21iaVssSXRlbV9XZWlnaHQ6PSBzY2FsaW5nXQ0KYGBgDQoNCldlIHNlZSB0aGF0IGhpc3RvcmdyYW0gb24gKipJdGVtX01SUCoqIGhhcyBmb3VyIHBlYWtzLiBUaGlzIHZhcmlhYmxlIHBvdGVudGlhbGx5IGNhbiBmb3JtIGZvdXIgZ3JvdXBzIHdpdGggc2luZ2xlIHZhcmlhYmxlLg0KDQpgYGB7ciBmaWcud2lkdGg9OS4zLCBmaWcuaGVpZ2g9M30NCnBhcihtZnJvdz1jKDEsMSkpDQpoaXN0KGNvbWJpJEl0ZW1fTVJQKQ0KYGBgDQoNCg0KVmFyaWFibGUgSXRlbV9WaXNpYmlsaXR5IGFuZCBPdXRsZXRfU2FsZXMgaXMgc2tld2VkIHRvd2FyZHMgbGVmdC4NCg0KYGBge3IgZmlnLndpZHRoPTkuMywgZmlnLmhlaWdoPTN9DQpwYXIobWZyb3c9YygxLDIpKQ0KaGlzdChjb21iaSRJdGVtX091dGxldF9TYWxlcykNCmhpc3QoY29tYmkkSXRlbV9WaXNpYmlsaXR5KQ0KYGBgDQoNCg0KIyBWaXN1YWxpemUgU2NhdHRlciBQbG90DQoNCmBgYHtyLCBmaWcud2lkdGg9OS4zfQ0KIyBzZWxlY3QgbnVtYmVyIGZlYXR1cmVzDQp0cmFpbi5kZiA9IGNvbWJpICU+JSANCiAgc2VsZWN0KCBJdGVtX1Zpc2liaWxpdHksSXRlbV9NUlAsIEl0ZW1fT3V0bGV0X1NhbGVzLCBPdXRsZXRfRXN0YWJsaXNobWVudF9ZZWFyKQ0KcGxvdCh0cmFpbi5kZikNCmBgYA0KDQojIEstTWVhbiBDbHVzdGVyaW5nDQoNCiMjIEZlYXR1cmUgU2VsZWN0aW9uDQoNCldlIGNob29zZSBvbmx5IFNJTkdMRSBmZWF0dXJlLCAqKkl0ZW1fTVJQKiogZm9yIHRoaXMgY2x1c3RlcmluZy4NCg0KYGBge3J9DQp0cmFpbi5kZjEgPSBjb21iaSAlPiUgc2VsZWN0KEl0ZW1fTVJQKQ0KYGBgDQoNCiMjIERpc2NvdmVyIEsNCg0KYGBge3J9DQoNCmsgPC0gbGlzdCgpDQpmb3IoaSBpbiAxOjEwKXsNCiAga1tbaV1dIDwtIGttZWFucyh0cmFpbi5kZjEsIGkpDQp9DQoNCmJldHdlZW5zc190b3RzcyA8LSBsaXN0KCkNCmZvcihpIGluIDE6MTApew0KICBiZXR3ZWVuc3NfdG90c3NbW2ldXSA8LSBrW1tpXV0kYmV0d2VlbnNzL2tbW2ldXSR0b3Rzcw0KfQ0KDQpwbG90KDE6MTAsIGJldHdlZW5zc190b3RzcywgdHlwZSA9ICJiIiwgDQogICAgIHlsYWIgPSAiQmV0d2VlbiBTUyAvIFRvdGFsIFNTIiwgeGxhYiA9ICJDbHVzdGVycyAoaykiKQ0KDQpgYGANCg0KDQojIyBDbHVzdGVyIGFuZCBQbG90DQoNCldlIGNob29zZSBrPTQsIHNpbWlsYXIgdG8gdmlzdWFsaXphdGlvbiBvbiBzY2F0dGVyIHBsb3QuDQoNCmBgYHtyLCBmaWcud2lkdGg9OS4zfQ0KIyBidWlsZCBjbHVzdGVyDQpzZXQuc2VlZCgxMjMpDQprbS5yZXMgPC0ga21lYW5zKHRyYWluLmRmMSwgNCwgbnN0YXJ0ID0yNSApIA0KDQojIGstbWVhbnMgZ3JvdXAgc2l6ZQ0Ka20uY2x1c3Rlcj1rbS5yZXMkY2x1c3Rlcg0KdGFibGUoa20uY2x1c3RlcikNCg0KIyB2aXN1YWxpemUNCnBsb3QoY29tYmlbLGMoJ0l0ZW1fVmlzaWJpbGl0eScsJ0l0ZW1fTVJQJywgJ0l0ZW1fT3V0bGV0X1NhbGVzJyldLCBjb2wgPSBrbS5jbHVzdGVyKQ0KYGBgDQoNCiMgSC1DbHVzdGVyaW5nDQoNCiMjIEZlYXR1cmUgU2VsZWN0aW9uDQoNCldlIGZvbGxvdyB0aGUgc2FtZSBTSU5HTEUgZmVhdHVyZSBsaWtlIHRoZSBrbWVhbiBhYm92ZS4NCg0KYGBge3IsIGZpZy53aWR0aD05LjN9DQp0cmFpbi5kZiA9IGNvbWJpICU+JSBzZWxlY3QoSXRlbV9NUlApDQoNCmQgPC0gZGlzdCh0cmFpbi5kZjIpDQpmaXRIIDwtIGhjbHVzdChkLCAid2FyZC5EMiIpDQpwbG90KGZpdEgpIA0KcmVjdC5oY2x1c3QoZml0SCwgayA9IDQsIGJvcmRlciA9ICJyZWQiKSANCmBgYA0KDQoNCiMjIENsdXN0ZXIgYW5kIFBsb3QNCg0KYGBge3J9DQpoYy5jbHVzdGVycyA8LSBjdXRyZWUoZml0SCwgayA9IDQpIA0KcGxvdChjb21iaVssYygnSXRlbV9WaXNpYmlsaXR5JywnSXRlbV9NUlAnLCAnSXRlbV9PdXRsZXRfU2FsZXMnKV0sIGNvbCA9IGNsdXN0ZXJzKQ0KYGBgDQoNCmBgYHtyfQ0KdGFibGUoaGMuY2x1c3RlcnMpDQpgYGANCg0KDQojIENsdXN0ZXIgQWdyZWVlbW50DQoNCldlIGNhbiBzZWUgdGhhdCBib3RoIGNsdXN0ZXJpbmcgbWV0aG9kIHF1aXRlIGFncmVlcyB0byBlYWNoIG90aGVyIGluIHRoaXMgZGF0YXNldCwgd2l0aCBvbmx5IHNvbWUgZXhjZXB0aW9uIG9uIGdyb3VwIDMgYW5kIGdyb3VwIDQuIA0KDQpgYGB7cn0NCmFncmVlbWVudC50YWJsZSA9IHRhYmxlKGttLmNsdXN0ZXIsIGhjLmNsdXN0ZXJzLCBkbm49Yygna21lYW5zJywnaGNsdXN0ZXInKSkNCmFncmVlbWVudC50YWJsZQ0KYGBgDQoNCioqVGhlIFJhbmQgSW5kZXgqKiBvciBSYW5kIG1lYXN1cmUgKG5hbWVkIGFmdGVyIFdpbGxpYW0gTS4gUmFuZCkgaW4gc3RhdGlzdGljcywgYW5kIGluIHBhcnRpY3VsYXIgaW4gZGF0YSBjbHVzdGVyaW5nLCBpcyBhIG1lYXN1cmUgb2YgdGhlIHNpbWlsYXJpdHkgYmV0d2VlbiB0d28gZGF0YSBjbHVzdGVyaW5ncy4gKipBZGp1c3RlZCBSYW5kIEluZGV4KiogaXMgY29ycmVsYXRlZCBjaGFuY2Ugb2YgdmVyc2lvbiBvZiBSYW5kIEluZGV4Lg0KDQoqQVJJKiogc2hvd3MgaGlnaCB2YWx1ZSBvZiAwLjk0Mywgd2hpY2ggbWVhbnMgYm90aCBIaWVyZWNoaWNhbCBjbHVzdGVyaW5nIGFuZCBrbWVhbnMgbWV0aG9kIGFncmVlcyBoaWdobHkgd2l0aCBlYWNoIG90aGVyLg0KDQpgYGB7cn0NCnJhbmRJbmRleChhZ3JlZW1lbnQudGFibGUpDQpgYGANCg0KDQoNCg0K