library(data.table)
library(dplyr)
library(ggplot2)
library(caret)
library(e1071)
library(flexclust)   # for randIndex() used at the end
#library(xgboost)
#library(corrplot)
setwd("./")
train = fread("Train_UWu5bXk.csv")
test = fread("Test_u94Q5KV.csv")
#test[,Item_Outlet_Sales :=NA]
combi = rbind(train, test, fill =T)
train = combi[1:nrow(train)]
Check for NAs:
colSums(is.na(combi))
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP
0 2439 0 0 0 0
Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 0 0 0 0 5681
Item_Weight has 2439 missing values, so we shall drop this column. Item_Outlet_Sales has 5681 NAs out of the 14204 combined rows; these are simply the test rows, where the target is unknown, so we shall replace them with the mean of the training values. (Removing every row that contains an NA is an alternative, left commented out below.)
# Data Cleaning
# Item_Weight is removed as it contains too many missing values
combi$Item_Weight <- NULL
#colSums(is.na(combi))
# Replace missing Item_Outlet_Sales with the mean of the observed (training) values
sales.mean = mean(combi$Item_Outlet_Sales, na.rm=TRUE)
combi$Item_Outlet_Sales[is.na(combi$Item_Outlet_Sales)] = sales.mean
# removing any rows that contain NA
#combi = na.omit(combi)
colSums(is.na(combi))
Item_Identifier Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier
0 0 0 0 0 0
Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 0 0 0 0
# Harmonize the inconsistent Item_Fat_Content labels
combi[Item_Fat_Content == "LF", Item_Fat_Content := "Low Fat"]
combi[Item_Fat_Content == "low fat", Item_Fat_Content := "Low Fat"]
combi[Item_Fat_Content == "reg", Item_Fat_Content := "Regular"]
# Treat empty Outlet_Size strings as missing
combi$Outlet_Size[combi$Outlet_Size == ""] <- NA
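A quick sanity check (my addition, not in the original flow) that the fat-content labels have collapsed to just "Low Fat" and "Regular":
table(combi$Item_Fat_Content)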
# Encoding Categorical Variables
# Label-encode the ordinal categoricals (an NA Outlet_Size propagates to an NA code)
combi[,Outlet_Size_num := ifelse(Outlet_Size == "Small", 0,
                          ifelse(Outlet_Size == "Medium", 1, 2))]
combi[,Outlet_Location_Type_num := ifelse(Outlet_Location_Type == "Tier 3", 0,
                                   ifelse(Outlet_Location_Type == "Tier 2", 1, 2))]
# removing categorical variables after label encoding
combi[, c("Outlet_Size", "Outlet_Location_Type") := NULL]
str(combi)
Classes ‘data.table’ and 'data.frame': 14204 obs. of 11 variables:
$ Item_Identifier : chr "FDA15" "DRC01" "FDN15" "FDX07" ...
$ Item_Fat_Content : chr "Low Fat" "Regular" "Low Fat" "Regular" ...
$ Item_Visibility : num 0.016 0.0193 0.0168 0 0 ...
$ Item_Type : chr "Dairy" "Soft Drinks" "Meat" "Fruits and Vegetables" ...
$ Item_MRP : num 249.8 48.3 141.6 182.1 53.9 ...
$ Outlet_Identifier : chr "OUT049" "OUT018" "OUT049" "OUT010" ...
$ Outlet_Establishment_Year: int 1999 2009 1999 1998 1987 2009 1987 1985 2002 2007 ...
$ Outlet_Type : chr "Supermarket Type1" "Supermarket Type2" "Supermarket Type1" "Grocery Store" ...
$ Item_Outlet_Sales : num 3735 443 2097 732 995 ...
$ Outlet_Size_num : num 1 1 1 NA 2 1 2 1 NA NA ...
$ Outlet_Location_Type_num : num 2 0 2 0 0 0 0 0 1 1 ...
- attr(*, ".internal.selfref")=<externalptr>
- attr(*, "index")= int
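Note that Outlet_Size_num still contains NAs, inherited from the blank Outlet_Size entries. They do no harm to the single-feature clustering below, but if the column were needed, a minimal mode-imputation sketch (my addition, not part of the original pipeline) would be:
# Hypothetical fix: fill missing Outlet_Size_num with the most frequent level
mode.size = as.numeric(names(which.max(table(combi$Outlet_Size_num))))
combi[is.na(Outlet_Size_num), Outlet_Size_num := mode.size]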
# Scaling: standardize the numeric columns (z-scores)
scaling = scale(combi$Item_Outlet_Sales)
combi[,Item_Outlet_Sales := scaling]
scaling = scale(combi$Item_Visibility)
combi[,Item_Visibility := scaling]
scaling = scale(combi$Item_MRP)
combi[,Item_MRP := scaling]
scaling = scale(combi$Outlet_Establishment_Year)
combi[,Outlet_Establishment_Year:= scaling]
#scaling = scale(combi$Item_Weight)
#combi[,Item_Weight:= scaling]
We see that the histogram of Item_MRP has four peaks. This single variable could potentially split the items into four groups.
par(mfrow=c(1,1))
hist(combi$Item_MRP)
Item_Visibility and Item_Outlet_Sales are right-skewed: most of the mass sits on the left, with a long tail to the right.
par(mfrow=c(1,2))
hist(combi$Item_Outlet_Sales)
hist(combi$Item_Visibility)
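Since e1071 is already loaded, its skewness() function can put a number on this; positive values confirm the right skew (standardizing is affine, so it does not change skewness):
skewness(combi$Item_Outlet_Sales)
skewness(combi$Item_Visibility)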
# select numeric features
train.df = combi %>%
select( Item_Visibility,Item_MRP, Item_Outlet_Sales, Outlet_Establishment_Year)
plot(train.df)
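As a numeric complement to the pairs plot, the pairwise correlations can be checked directly (the commented-out corrplot package would visualize the same matrix):
cor(train.df)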
We use only a SINGLE feature, Item_MRP, for this clustering.
train.df1 = combi %>% select(Item_MRP)
# Elbow method: fit k-means for k = 1..10 and track between-SS / total-SS
set.seed(123)
k <- list()
for(i in 1:10){
  k[[i]] <- kmeans(train.df1, i)
}
betweenss_totss <- sapply(k, function(fit) fit$betweenss/fit$totss)
plot(1:10, betweenss_totss, type = "b",
     ylab = "Between SS / Total SS", xlab = "Clusters (k)")
We choose k = 4, consistent with the four peaks in the Item_MRP histogram and the groups visible in the scatter plot.
# build cluster
set.seed(123)
km.res <- kmeans(train.df1, 4, nstart =25 )
# k-means group size
km.cluster=km.res$cluster
table(km.cluster)
km.cluster
1 2 3 4
2556 4931 2400 4317
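To relate the clusters back to the four histogram peaks, the fitted centres (on the scaled Item_MRP axis) can be inspected; a quick check:
# One centre per peak, in ascending order
sort(km.res$centers[, 1])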
# visualize
plot(combi[,c('Item_Visibility','Item_MRP', 'Item_Outlet_Sales')], col = km.cluster)
We use the same SINGLE feature, Item_MRP, as in the k-means above.
train.df = combi %>% select(Item_MRP)
d <- dist(train.df)
fitH <- hclust(d, "ward.D2")
plot(fitH)
rect.hclust(fitH, k = 4, border = "red")
hc.clusters <- cutree(fitH, k = 4)
plot(combi[,c('Item_Visibility','Item_MRP', 'Item_Outlet_Sales')], col = hc.clusters)
table(hc.clusters)
hc.clusters
1 2 3 4
2400 2208 4935 4661
Both clustering methods agree closely on this dataset: once the cluster labels are matched up, only 352 of the 14204 items (348 + 4) are placed differently.
agreement.table = table(km.cluster, hc.clusters, dnn=c('kmeans','hcluster'))
agreement.table
hcluster
kmeans 1 2 3 4
1 0 2208 0 348
2 0 0 4931 0
3 2400 0 0 0
4 0 0 4 4313
In statistics, and in particular in data clustering, the Rand Index or Rand measure (named after William M. Rand) is a measure of the similarity between two data clusterings. The Adjusted Rand Index (ARI) is the chance-corrected version of the Rand Index.
**ARI** shows a high value of 0.943, which means the hierarchical clustering and the k-means solution agree closely with each other.
randIndex(agreement.table)
ARI
0.9429736
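For intuition, the ARI can be reproduced from the contingency table itself; a minimal sketch of the Hubert-Arabie pair-counting formula (equivalent to what flexclust::randIndex computes):
n      = sum(agreement.table)                        # total items
sum.ij = sum(choose(agreement.table, 2))             # pairs together in both clusterings
sum.i  = sum(choose(rowSums(agreement.table), 2))    # pairs together under k-means
sum.j  = sum(choose(colSums(agreement.table), 2))    # pairs together under hclust
expected = sum.i * sum.j / choose(n, 2)              # expected overlap under chance
(sum.ij - expected) / ((sum.i + sum.j)/2 - expected) # adjusted Rand index, ~0.943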