K-means---Clustering.knit

# Attach the package that ships the built-in example datasets.
# library() stops with an error when the package is unavailable, whereas
# require() only returns FALSE and lets the script fail later on.
library("datasets")

# Loading the Iris dataset
# ---
#
data("iris")

# Viewing the structure of the dataset (column types and first values)
# ---
#
str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# Viewing the statistical summary of the dataset:
# min / quartiles / mean / max for each numeric column and
# the number of records per level for the Species factor.
# ---
#
summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# Previewing the dataset: head() prints its first six rows
# ---
#
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# Preprocessing the dataset
# ---
# Clustering is unsupervised, so the class label must not be given to the
# algorithm. The "Species" label is set aside in `iris.class` for later
# comparison and the four numeric measurements are kept in `iris.new`.
# The measurements are rescaled to the range 0..1 further below.
# ---
#
iris.class <- iris$Species
iris.new <- iris[, 1:4]
head(iris.new)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4

# Previewing the class column that was set aside (a factor of species names)
# ---
#
head(iris.class)

## [1] setosa setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica

# Normalizing the dataset so that no particular attribute
# has more impact on clustering algorithm than others.
# ---
#
# normalize(x) rescales `x` linearly so that its minimum maps to 0 and its
# maximum maps to 1.
#   x       : numeric vector to rescale.
#   returns : numeric vector of the same length as `x`.
# A constant input (max == min) would otherwise be divided by zero and turn
# into NaN; in that case the divisor is replaced by 1, so the result is all 0.
normalize <- function(x) {
  bounds <- range(x)
  span <- bounds[2] - bounds[1]
  if (isTRUE(span == 0)) {
    span <- 1
  }
  (x - bounds[1]) / span
}
normalize

## function(x){
##   return ((x-min(x)) / (max(x)-min(x)))
## }

# Rescale each of the four measurement columns to 0..1 on its own,
# then preview the result.
for (measure in c("Sepal.Length", "Sepal.Width",
                  "Petal.Length", "Petal.Width")) {
  iris.new[[measure]] <- normalize(iris.new[[measure]])
}
head(iris.new)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1   0.22222222   0.6250000   0.06779661  0.04166667
## 2   0.16666667   0.4166667   0.06779661  0.04166667
## 3   0.11111111   0.5000000   0.05084746  0.04166667
## 4   0.08333333   0.4583333   0.08474576  0.04166667
## 5   0.19444444   0.6666667   0.06779661  0.04166667
## 6   0.30555556   0.7916667   0.11864407  0.12500000

# Applying the K-means clustering algorithm with no. of centroids(k)=3
# ---
# kmeans() picks its starting centres at random, so without a fixed seed each
# run can number the clusters differently and may stop in a poorer solution.
# set.seed() makes the run repeatable; nstart = 25 keeps the best of 25
# random starts. Cluster numbers are arbitrary labels, so output printed by
# an earlier run may show the same groups under different numbers.
#
set.seed(123)
result <- kmeans(iris.new, centers = 3, nstart = 25)

# Previewing the no. of records in each cluster
# (one count per cluster, in cluster-number order)
result$size

## [1] 39 50 61

# These are the numbers of records contained in the three mutually exclusive clusters

# Getting the coordinates of the cluster centers:
# one row per cluster (3 centers for k=3), one column per attribute
# ---
#
result$centers

##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1    0.7072650   0.4508547   0.79704476  0.82478632
## 2    0.1961111   0.5950000   0.07830508  0.06083333
## 3    0.4412568   0.3073770   0.57571548  0.54918033

# Getting the cluster vector: for every record, in row order,
# the number of the cluster that record falls in
# ---
#
result$cluster

##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [75] 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1 1 3 1 1 1 1
## [112] 1 1 3 1 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 3 1 1 1 3 1 1 1 3 1 1 1 3 1
## [149] 1 3

# Visualizing the clustering results
# ---
# Two side-by-side scatter plots of the normalized iris measurements,
# coloured by the cluster each record was assigned to. The data plotted must
# be `iris.new`: `result$cluster` holds one label per iris record.
#
par(mfrow = c(1, 2), mar = c(5, 4, 2, 2))

# Plotting to see how Sepal.Length and Sepal.Width data points have been distributed in clusters
# ---
#
plot(iris.new[, 1:2], col = result$cluster)

# Let's see how clustering has performed on the Petal.Length and Petal.Width attributes.
plot(iris.new[, 3:4], col = result$cluster)

# Verifying the results of clustering
# ---
# 2 x 2 grid of scatter plots. Each row shows one pair of attributes twice:
# on the left coloured by the k-means cluster, on the right coloured by the
# original "class" attribute of the dataset.
#   row 1: Sepal.Length and Sepal.Width
#   row 2: Petal.Length and Petal.Width
#
par(mfrow = c(2, 2), mar = c(5, 4, 2, 2))
for (pair in list(c(1, 2), c(3, 4))) {
  plot(iris.new[pair], col = result$cluster)
  plot(iris.new[pair], col = iris.class)
}

# Cross-tabulating the cluster numbers against the true species.
# In the table printed below, cluster 1 is mostly Virginica (36 virginica,
# 3 versicolor), cluster 2 is exactly the 50 Setosa records, and cluster 3 is
# mostly Versicolor (47 versicolor, 14 virginica).
# The cluster numbers themselves carry no meaning; only the grouping does.
# ---
#
table(result$cluster, iris.class)

##    iris.class
##     setosa versicolor virginica
##   1      0          3        36
##   2     50          0         0
##   3      0         47        14

In order to improve this accuracy further, we may try different values of “k”. In some cases, it is also beneficial to change the algorithm in case k-means is unable to yield good results.

Challenges

## Challenge 1
# ---
# Question: Apply unsupervised learning to the given airquality dataset below.
# ---
# OUR CODE GOES BELOW
# 

# Load and view the dataset
# ---
# Importing the built-in airquality dataset and printing its structure
# (column types and first values; NA marks a missing measurement)
# ---
#
data("airquality")
str(airquality)

## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...

# Viewing the statistical summary of the dataset
# (the NA's row counts the missing values per column)
# ---
#
summary(airquality)

##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
##

# Previewing the dataset: first six rows, some of which contain NA
head(airquality)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

#2. Preprocess the dataset

Let’s begin by finding which attributes have missing values. We then need to impute those missing values (NA), which we will do simply by replacing each NA with the average of that attribute for the same month.

# Flag, for every column of airquality, whether it contains at least one NA
col1 <- vapply(airquality, anyNA, logical(1))
col1

##   Ozone Solar.R    Wind    Temp   Month     Day 
##    TRUE    TRUE   FALSE   FALSE   FALSE   FALSE

The output shows that only the Ozone and Solar.R attributes contain NA, i.e. missing values.

# Impute monthly mean in Ozone and Solar.R
# Every NA is replaced by the mean of the non-missing values of the same
# column within the same Month. ave() applies the replacement once per month
# group on the whole column, instead of recomputing the monthly mean for
# every missing row inside a 1:nrow() loop.
# as.numeric() is needed because both columns are stored as integers while
# the monthly means are fractional.
for (col in c("Ozone", "Solar.R")) {
  airquality[[col]] <- ave(
    as.numeric(airquality[[col]]),
    airquality[["Month"]],
    FUN = function(v) replace(v, is.na(v), mean(v, na.rm = TRUE))
  )
}

#Normalizing

# Normalize the dataset so that no particular attribute has more impact on
# the clustering algorithm than the others.
#
# normalize(x) rescales `x` linearly so that its minimum maps to 0 and its
# maximum maps to 1.
#   x       : numeric vector to rescale.
#   returns : numeric vector of the same length as `x`.
# A constant input (max == min) would otherwise be divided by zero and turn
# into NaN; in that case the divisor is replaced by 1, so the result is all 0.
normalize <- function(x) {
  bounds <- range(x)
  span <- bounds[2] - bounds[1]
  if (isTRUE(span == 0)) {
    span <- 1
  }
  (x - bounds[1]) / span
}

# Rescale every column on its own. Passing the whole data frame to
# normalize() would take one minimum and one maximum across all columns, so
# columns with a small numeric range would end up squeezed close to 0
# instead of spanning 0..1.
airquality[] <- lapply(airquality, normalize) # replace contents of dataset with normalized values

col1 <- mapply(anyNA, airquality) # apply function anyNA() on all columns of airquality dataset
col1

##   Ozone Solar.R    Wind    Temp   Month     Day 
##   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE

# Preview the first six rows of airquality after imputation and normalization
head(airquality)

##        Ozone   Solar.R       Wind      Temp      Month         Day
## 1 0.12012012 0.5675676 0.01921922 0.1981982 0.01201201 0.000000000
## 2 0.10510511 0.3513514 0.02102102 0.2132132 0.01201201 0.003003003
## 3 0.03303303 0.4444444 0.03483483 0.2192192 0.01201201 0.006006006
## 4 0.05105105 0.9369369 0.03153153 0.1831832 0.01201201 0.009009009
## 5 0.06791407 0.5414303 0.03993994 0.1651652 0.01201201 0.012012012
## 6 0.08108108 0.5414303 0.04174174 0.1951952 0.01201201 0.015015015

#3. Applying Kmeans Clustering algorithm

# Apply the k-means algorithm using the first 4 attributes
# (Ozone, Solar.R, Wind, Temp) and with k=3 (no. of required clusters).
result1 <- kmeans(airquality[c(1, 2, 3, 4)], 3)
# No. of records in each cluster. This has to read `result1`, the model
# fitted on airquality; `result` is the separate model fitted on iris.
result1$size

## [1] 39 50 61

#results

# Coordinates of the cluster centers: one row per cluster
# (3 centers for k=3), one column per attribute used in the clustering
result1$centers

##        Ozone   Solar.R       Wind      Temp
## 1 0.11983821 0.8086587 0.02868368 0.2316817
## 2 0.15748866 0.5412500 0.02264487 0.2421310
## 3 0.06706049 0.1804882 0.03004543 0.2140602

# Cluster vector: for every record, the number of the cluster it falls in
result1$cluster

##   [1] 2 3 2 1 2 2 1 3 3 2 2 1 1 1 3 1 1 3 1 3 3 1 3 3 3 1 2 3 1 2 1 1 1 1 2 2 1
##  [38] 2 1 1 1 1 1 2 1 1 2 1 3 3 2 2 3 3 1 2 2 3 3 3 2 1 1 1 3 2 1 1 1 1 2 2 1 2
##  [75] 1 3 1 1 1 2 2 3 1 1 1 2 3 3 2 1 1 1 3 3 3 2 2 2 1 2 2 2 2 2 1 2 3 3 3 3 1
## [112] 2 1 3 1 2 2 2 2 2 2 1 2 2 2 2 2 3 3 1 2 1 1 1 1 1 3 3 1 1 3 1 2 1 3 2 3 3
## [149] 2 2 2 2 1

#4. Visualizing clustering results

par(mfrow = c(1, 2), mar = c(5, 4, 2, 2))
# Plot to see how Ozone and Solar.R data points have been distributed in clusters.
# The colours must come from `result1` (the airquality model), not from
# `result`, which holds the cluster labels of the iris records.
plot(airquality[, 1:2], col = result1$cluster)

Graph shows that we have got 3 clearly distinguishable clusters for Ozone and Solar.R data points.

# Let’s see how clustering has performed on Wind and Temp attributes.
# Plot to see how Wind and Temp data points have been distributed in clusters;
# the colours come from `result1`, the model fitted on airquality.
plot(airquality[, 3:4], col = result1$cluster)

# This graph shows that Wind and Temp data points have not been clustered properly

# Let us find out which attributes have been taken into consideration more by
# the k-means algorithm. For this, we will plot all possible combinations of
# attributes, coloured by the airquality model's clusters (`result1`).
plot(airquality, col = result1$cluster) # Plot to see all attribute combinations

## Challenge 2
# ---
# Question: Create a model that clusters the following dataset.
# ---
# Dataset = http://bit.ly/SalaryDatasetClustering
# ---
# OUR CODE GOES BELOW
#
# Download the salary data with data.table's fread() and preview it.
library("data.table")
url <- "http://bit.ly/SalaryDatasetClustering"
salary <- fread(input = url)
head(salary)

##    Id      EmployeeName                                       JobTitle
## 1:  1    NATHANIEL FORD GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY
## 2:  2      GARY JIMENEZ                CAPTAIN III (POLICE DEPARTMENT)
## 3:  3    ALBERT PARDINI                CAPTAIN III (POLICE DEPARTMENT)
## 4:  4 CHRISTOPHER CHONG           WIRE ROPE CABLE MAINTENANCE MECHANIC
## 5:  5   PATRICK GARDNER   DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)
## 6:  6    DAVID SULLIVAN                      ASSISTANT DEPUTY CHIEF II
##      BasePay OvertimePay  OtherPay Benefits TotalPay TotalPayBenefits Year
## 1: 167411.18         0.0 400184.25          567595.4         567595.4 2011
## 2: 155966.02   245131.88 137811.38          538909.3         538909.3 2011
## 3: 212739.13   106088.18   16452.6          335279.9         335279.9 2011
## 4:   77916.0    56120.71  198306.9          332343.6         332343.6 2011
## 5:  134401.6      9737.0 182234.59          326373.2         326373.2 2011
## 6:  118602.0      8601.0 189082.74          316285.7         316285.7 2011
##    Notes        Agency Status
## 1:    NA San Francisco       
## 2:    NA San Francisco       
## 3:    NA San Francisco       
## 4:    NA San Francisco       
## 5:    NA San Francisco       
## 6:    NA San Francisco

# Structure of the salary dataset: column types and first values
str(salary)

## Classes 'data.table' and 'data.frame':   148654 obs. of  13 variables:
##  $ Id              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ EmployeeName    : chr  "NATHANIEL FORD" "GARY JIMENEZ" "ALBERT PARDINI" "CHRISTOPHER CHONG" ...
##  $ JobTitle        : chr  "GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY" "CAPTAIN III (POLICE DEPARTMENT)" "CAPTAIN III (POLICE DEPARTMENT)" "WIRE ROPE CABLE MAINTENANCE MECHANIC" ...
##  $ BasePay         : chr  "167411.18" "155966.02" "212739.13" "77916.0" ...
##  $ OvertimePay     : chr  "0.0" "245131.88" "106088.18" "56120.71" ...
##  $ OtherPay        : chr  "400184.25" "137811.38" "16452.6" "198306.9" ...
##  $ Benefits        : chr  "" "" "" "" ...
##  $ TotalPay        : num  567595 538909 335280 332344 326373 ...
##  $ TotalPayBenefits: num  567595 538909 335280 332344 326373 ...
##  $ Year            : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Notes           : logi  NA NA NA NA NA NA ...
##  $ Agency          : chr  "San Francisco" "San Francisco" "San Francisco" "San Francisco" ...
##  $ Status          : chr  "" "" "" "" ...
##  - attr(*, ".internal.selfref")=<externalptr>

It has 148654 observations of thirteen variables

# Column names of the salary dataset

colnames(salary)

##  [1] "Id"               "EmployeeName"     "JobTitle"         "BasePay"         
##  [5] "OvertimePay"      "OtherPay"         "Benefits"         "TotalPay"        
##  [9] "TotalPayBenefits" "Year"             "Notes"            "Agency"          
## [13] "Status"

library("dplyr")

## Warning: package 'dplyr' was built under R version 4.0.5

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Keep only the numeric columns of the salary data, then print them
nums_salary <- salary %>% select_if(is.numeric)
nums_salary

##             Id  TotalPay TotalPayBenefits Year
##      1:      1 567595.43        567595.43 2011
##      2:      2 538909.28        538909.28 2011
##      3:      3 335279.91        335279.91 2011
##      4:      4 332343.61        332343.61 2011
##      5:      5 326373.19        326373.19 2011
##     ---                                       
## 148650: 148650      0.00             0.00 2014
## 148651: 148651      0.00             0.00 2014
## 148652: 148652      0.00             0.00 2014
## 148653: 148653      0.00             0.00 2014
## 148654: 148654   -618.13          -618.13 2014

# Numerical columns

# Summary statistics of the numerical columns in the salary dataset
summary(nums_salary)

##        Id            TotalPay        TotalPayBenefits        Year     
##  Min.   :     1   Min.   :  -618.1   Min.   :  -618.1   Min.   :2011  
##  1st Qu.: 37164   1st Qu.: 36169.0   1st Qu.: 44065.7   1st Qu.:2012  
##  Median : 74328   Median : 71426.6   Median : 92404.1   Median :2013  
##  Mean   : 74328   Mean   : 74768.3   Mean   : 93692.6   Mean   :2013  
##  3rd Qu.:111491   3rd Qu.:105839.1   3rd Qu.:132876.5   3rd Qu.:2014  
##  Max.   :148654   Max.   :567595.4   Max.   :567595.4   Max.   :2014

# Missing values in numerical columns:
# flag, for every column of nums_salary, whether it contains at least one NA
col1 <- vapply(nums_salary, anyNA, logical(1))
col1

##               Id         TotalPay TotalPayBenefits             Year 
##            FALSE            FALSE            FALSE            FALSE

# normalize(x) rescales `x` linearly so that its minimum maps to 0 and its
# maximum maps to 1.
#   x       : numeric vector to rescale.
#   returns : numeric vector of the same length as `x`.
# A constant input (max == min) would otherwise be divided by zero and turn
# into NaN; in that case the divisor is replaced by 1, so the result is all 0.
normalize <- function(x) {
  bounds <- range(x)
  span <- bounds[2] - bounds[1]
  if (isTRUE(span == 0)) {
    span <- 1
  }
  (x - bounds[1]) / span
}
# Rescaled copies of the two pay columns, stored as standalone vectors.
# NOTE(review): these vectors are not written back into `nums_salary`, so the
# head() call below still prints the original, un-normalized values — confirm
# whether the normalized values were meant to replace the columns.
TotalPay<- normalize(nums_salary$TotalPay)
TotalPayBenefits <- normalize(nums_salary$TotalPayBenefits)

head(nums_salary)

##    Id TotalPay TotalPayBenefits Year
## 1:  1 567595.4         567595.4 2011
## 2:  2 538909.3         538909.3 2011
## 3:  3 335279.9         335279.9 2011
## 4:  4 332343.6         332343.6 2011
## 5:  5 326373.2         326373.2 2011
## 6:  6 316285.7         316285.7 2011

#Challenge 3

## Challenge 3
# ---
# Question: Cluster customers from the given wholesale customer database.
# ---
# Dataset source = https://archive.ics.uci.edu/ml/datasets/Wholesale+customers
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.0.5

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(dplyr)
library(ClusterR)

## Loading required package: gtools

# ---
# OUR CODE GOES BELOW

# Read the wholesale customers CSV from a local file, render it as a paged
# table and list its column names.
# NOTE(review): the path below is an absolute path on one machine, so this
# step only runs where that exact file exists.
url <- "C:/Users/RoySambu/Downloads/Wholesale customers data.csv"
customer <- read.csv(file = url)
rmarkdown::paged_table(customer)

colnames(customer)

## [1] "Channel"          "Region"           "Fresh"            "Milk"            
## [5] "Grocery"          "Frozen"           "Detergents_Paper" "Delicassen"

# Preprocessing

# Structure of the dataset: column types and first values
str(customer)

## 'data.frame':    440 obs. of  8 variables:
##  $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
##  $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
##  $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
##  $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
##  $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
##  $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...

440 observations of 8 variables

# Summary statistics (min, quartiles, mean, max) of every column
summary(customer)

##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0

# Normalize the dataset so that no particular attribute has more impact on
# the clustering algorithm than the others.
#
# normalize(x) rescales `x` linearly so that its minimum maps to 0 and its
# maximum maps to 1.
#   x       : numeric vector to rescale.
#   returns : numeric vector of the same length as `x`.
# A constant input (max == min) would otherwise be divided by zero and turn
# into NaN; in that case the divisor is replaced by 1, so the result is all 0.
normalize <- function(x) {
  bounds <- range(x)
  span <- bounds[2] - bounds[1]
  if (isTRUE(span == 0)) {
    span <- 1
  }
  (x - bounds[1]) / span
}
# Rescale every column on its own. Passing the whole data frame to
# normalize() takes one minimum and one maximum across all columns, which
# leaves columns with smaller values compressed near 0 instead of spanning
# 0..1.
customer[] <- lapply(customer, normalize)

##correlation

library(corrplot)

## corrplot 0.84 loaded

# Pairwise correlations between all columns, displayed with corrplot's
# "number" method
corrmatrix <- cor(x = customer)
corrplot(corr = corrmatrix, method = "number")

##K-means Cluster algorithm

# Apply the k-means algorithm with k=4 (no. of required clusters) on five
# attributes: columns 3, 4, 6, 7 and 8 (Fresh, Milk, Frozen,
# Detergents_Paper, Delicassen). Column 5 (Grocery) is left out.
result3 <- kmeans(x = customer[c(3, 4, 6, 7, 8)], centers = 4)
# No. of records in each cluster
result3$size

## [1]  23 286  11 120

# Coordinates of the cluster centers: one row per cluster
# (4 centers for k=4), one column per attribute used in the clustering
result3$centers

##        Fresh       Milk     Frozen Detergents_Paper Delicassen
## 1 0.43954603 0.04442963 0.07387234      0.008571207 0.02267228
## 2 0.04661461 0.04846949 0.01905275      0.027865933 0.01001777
## 3 0.18154825 0.34578608 0.05932315      0.164870101 0.07528067
## 4 0.18032234 0.03374053 0.03539612      0.011006019 0.01469996

# Cluster vector: for every record, the number of the cluster it falls in
result3$cluster

##   [1] 2 2 2 4 4 2 2 2 2 2 2 4 4 4 4 2 2 2 4 2 4 2 4 3 4 4 2 4 2 1 4 2 4 4 2 2 4
##  [38] 4 2 1 4 4 2 2 2 2 2 3 2 2 2 2 1 2 4 2 3 2 4 2 2 3 2 2 2 2 2 4 2 2 4 4 2 4
##  [75] 2 4 2 2 2 2 2 2 2 4 2 3 3 1 2 4 2 4 3 4 2 2 2 2 2 2 2 2 2 1 4 4 2 2 2 2 2
## [112] 2 4 4 4 2 2 2 4 2 4 2 4 2 1 1 4 4 2 1 2 2 4 2 2 2 2 2 4 2 4 4 1 2 4 4 2 2
## [149] 2 4 4 2 4 2 2 2 2 4 2 2 2 4 4 2 2 4 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 3 2
## [186] 2 2 2 2 2 4 4 2 2 2 4 4 2 2 2 2 2 4 2 2 2 2 2 2 2 4 3 2 2 2 2 2 4 2 2 4 2
## [223] 2 2 2 2 4 2 2 2 2 2 4 2 4 2 2 4 2 1 4 4 4 2 2 2 2 4 4 2 2 3 2 4 2 4 2 2 1
## [260] 1 2 2 4 2 2 2 2 4 2 4 2 2 2 1 2 2 4 2 2 4 2 2 1 4 1 1 2 4 4 1 2 2 2 2 4 2
## [297] 4 2 2 2 4 2 2 2 2 2 2 4 2 2 2 4 2 2 2 2 2 2 2 3 2 2 4 4 4 1 2 2 4 2 2 2 4
## [334] 2 4 4 4 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 4 2 4 2 2 2 4 2 2 2 2 2 2 2 4 2
## [371] 1 4 2 4 2 2 2 1 2 2 4 4 4 2 2 2 2 4 2 2 2 2 2 4 4 2 2 2 2 2 2 4 4 4 4 2 4
## [408] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 2 4 2 2 2 2 4 2 4 4 1 2 2 2

#Visualizing clustering results
# Scatter plots of neighbouring attribute pairs, coloured by assigned cluster.

# Fresh and Milk (columns 3 and 4)
plot(customer[, c("Fresh", "Milk")], col = result3$cluster)

# Milk and Grocery (columns 4 and 5)
plot(customer[, c("Milk", "Grocery")], col = result3$cluster)

# Grocery and Frozen (columns 5 and 6)
plot(customer[, c("Grocery", "Frozen")], col = result3$cluster)

# Frozen and Detergents_Paper (columns 6 and 7)
plot(customer[, c("Frozen", "Detergents_Paper")], col = result3$cluster)

# All attribute combinations at once
plot(customer, col = result3$cluster)

R Programming: K-Means Clustering

Example

Load and view the dataset

—

Importing the dataset

—

Challenges