## Example
# ---
# Question: Perform clustering analysis on the following dataset using the K-Means clustering algorithm.
# ---
# OUR CODE GOES BELOW
#
require("datasets")
# Loading the Iris dataset
# ---
#
data("iris")
# Viewing the structure of the dataset
# ---
#
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Viewing the statistical summary of the dataset
# ---
#
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Previewing the dataset
# ---
#
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Preprocessing the dataset
# ---
# Since clustering is a type of Unsupervised Learning,
# we would not require Class Label(output) during execution of our algorithm.
# We will, therefore, remove Class Attribute “Species” and store it in another variable.
# We would then normalize the attributes between 0 and 1 using our own function.
# ---
#
iris.new<- iris[, c(1, 2, 3, 4)]
iris.class<- iris[, "Species"]
head(iris.new)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
# Previewing the class column
# ---
#
head(iris.class)
## [1] setosa setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica
# Normalizing the dataset so that no particular attribute
# has more impact on clustering algorithm than others.
# ---
#
normalize <- function(x){
return ((x-min(x)) / (max(x)-min(x)))
}
normalize
## function(x){
## return ((x-min(x)) / (max(x)-min(x)))
## }
iris.new$Sepal.Length<- normalize(iris.new$Sepal.Length)
iris.new$Sepal.Width<- normalize(iris.new$Sepal.Width)
iris.new$Petal.Length<- normalize(iris.new$Petal.Length)
iris.new$Petal.Width<- normalize(iris.new$Petal.Width)
head(iris.new)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 0.22222222 0.6250000 0.06779661 0.04166667
## 2 0.16666667 0.4166667 0.06779661 0.04166667
## 3 0.11111111 0.5000000 0.05084746 0.04166667
## 4 0.08333333 0.4583333 0.08474576 0.04166667
## 5 0.19444444 0.6666667 0.06779661 0.04166667
## 6 0.30555556 0.7916667 0.11864407 0.12500000
# Applying the K-means clustering algorithm with no. of centroids(k)=3
# ---
#
result<- kmeans(iris.new,3)
# Previewing the no. of records in each cluster
#
result$size
## [1] 39 50 61
#These are the records contained in the three mutually exclusive clusters
# Getting the value of cluster center datapoint value(3 centers for k=3)
# ---
#
result$centers
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 0.7072650 0.4508547 0.79704476 0.82478632
## 2 0.1961111 0.5950000 0.07830508 0.06083333
## 3 0.4412568 0.3073770 0.57571548 0.54918033
# Getting the cluster vector that shows the cluster where each record falls
# ---
#
result$cluster
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [75] 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1 1 3 1 1 1 1
## [112] 1 1 3 1 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 3 1 1 1 3 1 1 1 3 1 1 1 3 1
## [149] 1 3
# The graph shows that we have got 3 clearly distinguishable clusters for Ozone and Solar.R data points.
# Let’s see how clustering has performed on Wind and Temp attributes.
# Visualizing the clustering results
# ---
#
par(mfrow = c(1,2), mar = c(5,4,2,2))
# Plotting to see how Ozone and Solar.R data points have been distributed in clusters
# ---
#
plot(airquality[,1:2], col = result$cluster)
# Verifying the results of clustering
# ---
#
par(mfrow = c(2,2), mar = c(5,4,2,2))
# Plotting to see how Sepal.Length and Sepal.Width data points have been distributed in clusters
plot(iris.new[c(1,2)], col = result$cluster)
# Plotting to see how Sepal.Length and Sepal.Width data points have been distributed
# originally as per "class" attribute in dataset
# ---
#
plot(iris.new[c(1,2)], col = iris.class)
# Plotting to see how Petal.Length and Petal.Width data points have been distributed in clusters
# ---
#
plot(iris.new[c(3,4)], col = result$cluster)
# Plotting to see how Petal.Length and Petal.Width data points have been distributed in class
# originally as per "class" attribute in dataset
#
plot(iris.new[c(3,4)], col = iris.class)
# Result of table shows that Cluster 1 corresponds to Virginica,
# Cluster 2 corresponds to Versicolor and Cluster 3 to Setosa.
# ---
#
table(result$cluster, iris.class)
## iris.class
## setosa versicolor virginica
## 1 0 3 36
## 2 50 0 0
## 3 0 47 14
In order to improve this accuracy further, we may try different values of “k”. In some cases, it is also beneficial to change the algorithm in case k-means is unable to yield good results.
## Challenge 1
# ---
# Question: Apply unsupervised learning to the given airquality dataset below.
# ---
# OUR CODE GOES BELOW
#
# Load and view the dataset
# ---
# Importing the dataset
# ---
#
data("airquality")
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Viewing the statistical summary of the dataset
# ---
#
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
#Previewing the dataset
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
#2. Preprocess the dataset
Let’s begin by finding which attributes have missing values. We then need to impute those missing values(NA), which we will be doing simply by replacing NA with monthly average.
col1<- mapply(anyNA,airquality) # apply function anyNA() on all columns of airquality dataset
col1
## Ozone Solar.R Wind Temp Month Day
## TRUE TRUE FALSE FALSE FALSE FALSE
The output shows that only Ozone and Solar.R attributes have NA i.e. some missing value
# Impute monthly mean in Ozone
for (i in 1:nrow(airquality)){
if(is.na(airquality[i,"Ozone"])){
airquality[i,"Ozone"]<- mean(airquality[which(airquality[,"Month"]==airquality[i,"Month"]),"Ozone"],na.rm = TRUE)
}}
# Impute monthly mean in Solar.R
for (i in 1:nrow(airquality)){
if(is.na(airquality[i,"Solar.R"])){
airquality[i,"Solar.R"]<- mean(airquality[which(airquality[,"Month"]==airquality[i,"Month"]),"Solar.R"],na.rm = TRUE)
}}
#Normalizing
#Normalize the dataset so that no particular attribute has more impact on clustering algorithm than others.
normalize<- function(x){
return((x-min(x))/(max(x)-min(x)))
}
airquality<- normalize(airquality) # replace contents of dataset with normalized values
col1<- mapply(anyNA,airquality) # apply function anyNA() on all columns of airquality dataset
col1
## Ozone Solar.R Wind Temp Month Day
## FALSE FALSE FALSE FALSE FALSE FALSE
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 0.12012012 0.5675676 0.01921922 0.1981982 0.01201201 0.000000000
## 2 0.10510511 0.3513514 0.02102102 0.2132132 0.01201201 0.003003003
## 3 0.03303303 0.4444444 0.03483483 0.2192192 0.01201201 0.006006006
## 4 0.05105105 0.9369369 0.03153153 0.1831832 0.01201201 0.009009009
## 5 0.06791407 0.5414303 0.03993994 0.1651652 0.01201201 0.012012012
## 6 0.08108108 0.5414303 0.04174174 0.1951952 0.01201201 0.015015015
#3. Applying Kmeans Clustering algorithm
result1<- kmeans(airquality[c(1,2,3,4)],3) # apply k-means algorithmusing first 4 attributes and with k=3(no. of required clusters)
result$size # gives no. of records in each cluster
## [1] 39 50 61
#results
result1$centers # gives value of cluster center datapoint value(3 centers for k=3)
## Ozone Solar.R Wind Temp
## 1 0.11983821 0.8086587 0.02868368 0.2316817
## 2 0.15748866 0.5412500 0.02264487 0.2421310
## 3 0.06706049 0.1804882 0.03004543 0.2140602
result1$cluster #gives cluster vector showing the custer where each record falls
## [1] 2 3 2 1 2 2 1 3 3 2 2 1 1 1 3 1 1 3 1 3 3 1 3 3 3 1 2 3 1 2 1 1 1 1 2 2 1
## [38] 2 1 1 1 1 1 2 1 1 2 1 3 3 2 2 3 3 1 2 2 3 3 3 2 1 1 1 3 2 1 1 1 1 2 2 1 2
## [75] 1 3 1 1 1 2 2 3 1 1 1 2 3 3 2 1 1 1 3 3 3 2 2 2 1 2 2 2 2 2 1 2 3 3 3 3 1
## [112] 2 1 3 1 2 2 2 2 2 2 1 2 2 2 2 2 3 3 1 2 1 1 1 1 1 3 3 1 1 3 1 2 1 3 2 3 3
## [149] 2 2 2 2 1
#4. Visualizing clustering results
par(mfrow=c(1,2), mar=c(5,4,2,2))
plot(airquality[,1:2], col=result$cluster) # Plot to see how Ozone and Solar.R data points have been distributed in clusters
Graph shows that we have got 3 clearly distinguishable clusters for Ozone and Solar.R data points.
#Let’s see how clustering has performed on Wind and Temp attributes.
plot(airquality[,3:4], col=result$cluster) # Plot to see how Wind and Temp data points have been distributed in clusters
#This graph shows that Wind and Temp data points have not been clustered properly
#Let us find out which attributes have been taken into consideration more by k-means algorithm. For this, we will #plot all possible combinations of attributes
plot(airquality[,], col=result$cluster) # Plot to see all attribute combinations
## Challenge 2
# ---
# Question: Create a model that clusters the following dataset.
# ---
# Dataset = http://bit.ly/SalaryDatasetClustering
# ---
# OUR CODE GOES BELOW
#
library("data.table")
url=("http://bit.ly/SalaryDatasetClustering")
salary<-fread(url)
head(salary)
## Id EmployeeName JobTitle
## 1: 1 NATHANIEL FORD GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY
## 2: 2 GARY JIMENEZ CAPTAIN III (POLICE DEPARTMENT)
## 3: 3 ALBERT PARDINI CAPTAIN III (POLICE DEPARTMENT)
## 4: 4 CHRISTOPHER CHONG WIRE ROPE CABLE MAINTENANCE MECHANIC
## 5: 5 PATRICK GARDNER DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)
## 6: 6 DAVID SULLIVAN ASSISTANT DEPUTY CHIEF II
## BasePay OvertimePay OtherPay Benefits TotalPay TotalPayBenefits Year
## 1: 167411.18 0.0 400184.25 567595.4 567595.4 2011
## 2: 155966.02 245131.88 137811.38 538909.3 538909.3 2011
## 3: 212739.13 106088.18 16452.6 335279.9 335279.9 2011
## 4: 77916.0 56120.71 198306.9 332343.6 332343.6 2011
## 5: 134401.6 9737.0 182234.59 326373.2 326373.2 2011
## 6: 118602.0 8601.0 189082.74 316285.7 316285.7 2011
## Notes Agency Status
## 1: NA San Francisco
## 2: NA San Francisco
## 3: NA San Francisco
## 4: NA San Francisco
## 5: NA San Francisco
## 6: NA San Francisco
#structure of the salary dataset
str(salary)
## Classes 'data.table' and 'data.frame': 148654 obs. of 13 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ EmployeeName : chr "NATHANIEL FORD" "GARY JIMENEZ" "ALBERT PARDINI" "CHRISTOPHER CHONG" ...
## $ JobTitle : chr "GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY" "CAPTAIN III (POLICE DEPARTMENT)" "CAPTAIN III (POLICE DEPARTMENT)" "WIRE ROPE CABLE MAINTENANCE MECHANIC" ...
## $ BasePay : chr "167411.18" "155966.02" "212739.13" "77916.0" ...
## $ OvertimePay : chr "0.0" "245131.88" "106088.18" "56120.71" ...
## $ OtherPay : chr "400184.25" "137811.38" "16452.6" "198306.9" ...
## $ Benefits : chr "" "" "" "" ...
## $ TotalPay : num 567595 538909 335280 332344 326373 ...
## $ TotalPayBenefits: num 567595 538909 335280 332344 326373 ...
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Notes : logi NA NA NA NA NA NA ...
## $ Agency : chr "San Francisco" "San Francisco" "San Francisco" "San Francisco" ...
## $ Status : chr "" "" "" "" ...
## - attr(*, ".internal.selfref")=<externalptr>
It has 148654 observations of thirteen variables
#columns names
colnames(salary)
## [1] "Id" "EmployeeName" "JobTitle" "BasePay"
## [5] "OvertimePay" "OtherPay" "Benefits" "TotalPay"
## [9] "TotalPayBenefits" "Year" "Notes" "Agency"
## [13] "Status"
library("dplyr")
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
nums_salary<-select_if(salary,is.numeric)
nums_salary
## Id TotalPay TotalPayBenefits Year
## 1: 1 567595.43 567595.43 2011
## 2: 2 538909.28 538909.28 2011
## 3: 3 335279.91 335279.91 2011
## 4: 4 332343.61 332343.61 2011
## 5: 5 326373.19 326373.19 2011
## ---
## 148650: 148650 0.00 0.00 2014
## 148651: 148651 0.00 0.00 2014
## 148652: 148652 0.00 0.00 2014
## 148653: 148653 0.00 0.00 2014
## 148654: 148654 -618.13 -618.13 2014
#numerical columns
#summary of the numerical columns in salary dataset
summary(nums_salary)
## Id TotalPay TotalPayBenefits Year
## Min. : 1 Min. : -618.1 Min. : -618.1 Min. :2011
## 1st Qu.: 37164 1st Qu.: 36169.0 1st Qu.: 44065.7 1st Qu.:2012
## Median : 74328 Median : 71426.6 Median : 92404.1 Median :2013
## Mean : 74328 Mean : 74768.3 Mean : 93692.6 Mean :2013
## 3rd Qu.:111491 3rd Qu.:105839.1 3rd Qu.:132876.5 3rd Qu.:2014
## Max. :148654 Max. :567595.4 Max. :567595.4 Max. :2014
#missing values in numerical columns
col1<- mapply(anyNA,nums_salary) # apply function anyNA() on all columns of airquality dataset
col1
## Id TotalPay TotalPayBenefits Year
## FALSE FALSE FALSE FALSE
normalize<- function(x){
return((x-min(x))/(max(x)-min(x)))
}
TotalPay<- normalize(nums_salary$TotalPay)
TotalPayBenefits <- normalize(nums_salary$TotalPayBenefits)
head(nums_salary)
## Id TotalPay TotalPayBenefits Year
## 1: 1 567595.4 567595.4 2011
## 2: 2 538909.3 538909.3 2011
## 3: 3 335279.9 335279.9 2011
## 4: 4 332343.6 332343.6 2011
## 5: 5 326373.2 326373.2 2011
## 6: 6 316285.7 316285.7 2011
#Challenge 3
## Challenge 3
# ---
# Question: Cluster customers from the given wholesale customer database.```{R}
# Question: Cluster customers from the given wholesale customer database.
# ---
# Dataset source = https://archive.ics.uci.edu/ml/datasets/Wholesale+customers
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
library(ClusterR)
## Loading required package: gtools
# ---
# OUR CODE GOES BELOW
#
url=("C:/Users/RoySambu/Downloads/Wholesale customers data.csv")
customer<-read.csv(url)
rmarkdown::paged_table(customer)
colnames(customer)
## [1] "Channel" "Region" "Fresh" "Milk"
## [5] "Grocery" "Frozen" "Detergents_Paper" "Delicassen"
#preprocessing
#structure of the dataset
str(customer)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
4440 observations of 8 variables
##summary of the dataset
summary(customer)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
#Normalize the dataset so that no particular attribute has more impact on clustering algorithm than others.
normalize<- function(x){
return((x-min(x))/(max(x)-min(x)))
}
customer<- normalize(customer)
##correlation
library(corrplot)
## corrplot 0.84 loaded
corrmatrix <-cor(customer)
corrplot(corrmatrix, method='number')
##K-means Cluster algorithm
result3<- kmeans(customer[c(3,4,6,7,8)],4) # apply k-means algorithmusing first 4 attributes and with k=4(no. of required clusters)
result3$size # gives no. of records in each cluster
## [1] 23 286 11 120
result3$centers # gives value of cluster center datapoint value(4 centers for k=4)
## Fresh Milk Frozen Detergents_Paper Delicassen
## 1 0.43954603 0.04442963 0.07387234 0.008571207 0.02267228
## 2 0.04661461 0.04846949 0.01905275 0.027865933 0.01001777
## 3 0.18154825 0.34578608 0.05932315 0.164870101 0.07528067
## 4 0.18032234 0.03374053 0.03539612 0.011006019 0.01469996
result3$cluster #gives cluster vector showing the custer where each record falls
## [1] 2 2 2 4 4 2 2 2 2 2 2 4 4 4 4 2 2 2 4 2 4 2 4 3 4 4 2 4 2 1 4 2 4 4 2 2 4
## [38] 4 2 1 4 4 2 2 2 2 2 3 2 2 2 2 1 2 4 2 3 2 4 2 2 3 2 2 2 2 2 4 2 2 4 4 2 4
## [75] 2 4 2 2 2 2 2 2 2 4 2 3 3 1 2 4 2 4 3 4 2 2 2 2 2 2 2 2 2 1 4 4 2 2 2 2 2
## [112] 2 4 4 4 2 2 2 4 2 4 2 4 2 1 1 4 4 2 1 2 2 4 2 2 2 2 2 4 2 4 4 1 2 4 4 2 2
## [149] 2 4 4 2 4 2 2 2 2 4 2 2 2 4 4 2 2 4 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 3 2
## [186] 2 2 2 2 2 4 4 2 2 2 4 4 2 2 2 2 2 4 2 2 2 2 2 2 2 4 3 2 2 2 2 2 4 2 2 4 2
## [223] 2 2 2 2 4 2 2 2 2 2 4 2 4 2 2 4 2 1 4 4 4 2 2 2 2 4 4 2 2 3 2 4 2 4 2 2 1
## [260] 1 2 2 4 2 2 2 2 4 2 4 2 2 2 1 2 2 4 2 2 4 2 2 1 4 1 1 2 4 4 1 2 2 2 2 4 2
## [297] 4 2 2 2 4 2 2 2 2 2 2 4 2 2 2 4 2 2 2 2 2 2 2 3 2 2 4 4 4 1 2 2 4 2 2 2 4
## [334] 2 4 4 4 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 4 2 4 2 2 2 4 2 2 2 2 2 2 2 4 2
## [371] 1 4 2 4 2 2 2 1 2 2 4 4 4 2 2 2 2 4 2 2 2 2 2 4 4 2 2 2 2 2 2 4 4 4 4 2 4
## [408] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 2 4 2 2 2 2 4 2 4 4 1 2 2 2
#Visualizing clustering results
plot(customer[,3:4], col=result3$cluster) # Plot to see how milk and fresh data points have been distributed in clusters
plot(customer[,4:5], col=result3$cluster) # Plot to see how milk and Grocery data points have been distributed in clusters
plot(customer[,5:6], col=result3$cluster) # Plot to see how Frozen and Grocery data points have been distributed in clusters
plot(customer[,6:7], col=result3$cluster) # Plot to see how milk and Grocery data points have been distributed in clusters
plot(customer[,], col=result3$cluster)