ActivityU7-Victor Omondi

# Load the tab-delimited rental data and drop columns 1 and 6 by position;
# the remaining columns are the ones that get scaled and clustered below.
rent1 <- read.delim("C:/Users/Q/Downloads/Rent_Data.txt")
rent <- rent1[-c(1, 6)]

# Flag potential outliers. Nothing is dropped here: `rent` keeps every row,
# the flags are only reported for inspection.
# With type = "iqr" the `lim` argument of scores() is a numeric cutoff on the
# IQR score; the earlier call passed the string "iqr" instead of a number.
# 1.5 corresponds to the conventional 1.5 * IQR fence.
library(outliers)
outlier_flags <- scores(na.omit(rent), type = "iqr", prob = NA, lim = 1.5)
outlier_flags

# Centre and scale every column so that all variables are on a common scale
# before distances are computed, then inspect the per-column summaries.
scaled_rent <- scale(rent, center = TRUE, scale = TRUE)
summary(scaled_rent)

##       Rent         Distance.from.Airport Distance.to...Downtown
##  Min.   :-1.1486   Min.   :-1.5853       Min.   :-1.5624       
##  1st Qu.:-0.7384   1st Qu.:-0.9554       1st Qu.:-0.8694       
##  Median :-0.4454   Median : 0.1995       Median :-0.1764       
##  Mean   : 0.0000   Mean   : 0.0000       Mean   : 0.0000       
##  3rd Qu.: 1.0197   3rd Qu.: 0.9869       3rd Qu.: 0.8316       
##  Max.   : 1.6643   Max.   : 1.3543       Max.   : 1.4616       
##  Distance.to.University
##  Min.   :-1.4665       
##  1st Qu.:-0.6637       
##  Median : 0.1392       
##  Mean   : 0.0000       
##  3rd Qu.: 0.5138       
##  Max.   : 2.1730

# Pairwise Euclidean distances between the standardised observations.
dist_among_observed <- dist(x = scaled_rent, method = "euclidean")
# Show the first six entries of the frequency table of the distance values.
head(x = table(dist_among_observed), n = 6L)

## dist_among_observed
##  0.47688569332196 0.526433710186056 0.610904547152077 0.610904547152078 
##                 1                 1                 1                 1 
## 0.629392393721958 0.661318394049812 
##                 1                 1

# Agglomerative hierarchical clustering of the standardised observations.
# Average linkage is run directly on the Euclidean distances.
cluster_fit_average <- hclust(dist_among_observed, method = "average")
# Centroid linkage is meant to be used with *squared* Euclidean distances
# (see ?hclust), so the distance object is squared first. Merge heights of
# this tree are therefore in squared-distance units.
cluster_fit_centroid <- hclust(dist_among_observed^2, method = "centroid")

library(NbClust)
# Ask NbClust to evaluate candidate solutions with 2 to 5 clusters, using
# Euclidean distances and average-linkage clustering.
number_of_clusters <- NbClust(
  data = scaled_rent,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 5,
  method = "average"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 9 proposed 2 as the best number of clusters 
## * 10 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 2 proposed 5 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

# Draw the centroid-linkage dendrogram, labelling each leaf with its rent
# value, then box the three-cluster solution on top of it.
plot(cluster_fit_centroid, labels = rent$Rent)
rect.hclust(tree = cluster_fit_centroid, k = 3)

# Cut the average-linkage tree into three groups and show the cluster label
# assigned to each observation.
final_clusters <- cutree(tree = cluster_fit_average, k = 3)
final_clusters

##  [1] 1 1 1 1 1 2 1 2 1 1 2 2 2 3 3 1 2 2 2 1

# Cut the centroid-linkage tree into three groups as well, for comparison
# with the average-linkage labels.
final_clusters1 <- cutree(tree = cluster_fit_centroid, k = 3)
final_clusters1

##  [1] 1 1 1 1 1 2 1 2 1 1 2 2 2 3 3 1 2 2 2 1

# Show the unscaled data that the clusters will be summarised on.
rent

# Median of every variable within each average-linkage cluster.
aggregate(x = rent, by = list(cluster = final_clusters), FUN = median)

# Repeat the NbClust search over 2 to 5 clusters, this time with k-means as
# the clustering method.
optimal_clusters <- NbClust(
  data = scaled_rent,
  min.nc = 2,
  max.nc = 5,
  method = "kmeans"
)

## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 10 proposed 2 as the best number of clusters 
## * 10 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 2 proposed 5 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************

# Print the full NbClust result for the k-means run: the index values for each
# candidate number of clusters, the critical values, the best number of
# clusters per index, and the best partition.
optimal_clusters

## $All.index
##       KL      CH Hartigan     CCC    Scott   Marriot  TrCovW  TraceW Friedman
## 2 6.5164 41.4423   9.7260  0.9730  44.4857 1226.0515 58.1023 23.0139  20.6982
## 3 1.7392 34.7371   6.0442  0.8702  67.9151  854.9249 22.5630 14.9409  24.4803
## 4 2.6774 31.4414   3.1314  0.5742  87.1351  581.3630 10.4622 11.0221  31.3540
## 5 0.6694 27.1677   3.3347 -0.0470 107.2006  333.0823  7.4508  9.2180  48.4969
##    Rubin Cindex     DB Silhouette   Duda Pseudot2  Beale Ratkowsky    Ball
## 2 3.3023 0.3419 0.6017     0.5742 0.5606   7.8372 1.7201    0.5839 11.5070
## 3 5.0867 0.3262 0.6350     0.5269 0.4120   8.5646 3.0632    0.5172  4.9803
## 4 6.8953 0.4570 0.8674     0.4332 9.0009  -8.8890 0.0000    0.4622  2.7555
## 5 8.2447 0.4690 0.8263     0.3777 0.6387   3.3946 1.1708    0.4191  1.8436
##   Ptbiserial   Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2     0.8070 0.7603  0.4373 0.3601 0.0178  1.2370 0.9747 0.3275
## 3     0.8103 2.0500  0.5925 0.4370 0.0182  1.1999 0.8009 0.1951
## 4     0.6770 2.3866  1.0277 0.3560 0.0193  1.9718 0.7107 0.1682
## 5     0.6201 1.7215  1.2875 0.3801 0.0193  2.2134 0.6487 0.1414
## 
## $All.CriticalValues
##   CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2         0.2576            28.8239       0.1646
## 3         0.2019            23.7173       0.0303
## 4        -0.5879           -27.0105          NaN
## 5         0.1265            41.4361       0.3485
## 
## $Best.nc
##                     KL      CH Hartigan   CCC   Scott Marriot  TrCovW TraceW
## Number_clusters 2.0000  2.0000   3.0000 2.000  3.0000  3.0000  3.0000 3.0000
## Value_Index     6.5164 41.4423   3.6818 0.973 23.4294 97.5647 35.5394 4.1543
##                 Friedman   Rubin Cindex     DB Silhouette   Duda PseudoT2
## Number_clusters   5.0000  4.0000 3.0000 2.0000     2.0000 2.0000   2.0000
## Value_Index      17.1428 -0.4591 0.3262 0.6017     0.5742 0.5606   7.8372
##                  Beale Ratkowsky   Ball PtBiserial Frey McClain  Dunn Hubert
## Number_clusters 2.0000    2.0000 3.0000     3.0000    1  2.0000 3.000      0
## Value_Index     1.7201    0.5839 6.5267     0.8103   NA  0.4373 0.437      0
##                 SDindex Dindex   SDbw
## Number_clusters  3.0000      0 5.0000
## Value_Index      1.1999      0 0.1414
## 
## $Best.partition
##  [1] 1 1 1 1 1 2 1 2 1 1 2 2 2 1 1 1 2 2 2 1

# Fit k-means with two clusters on the standardised data, keeping the best of
# 20 random starts.
# kmeans() chooses its starting centres at random, so the seed is fixed first;
# otherwise the cluster labels (1 vs 2) can swap from one run to the next and
# the printed results are not reproducible.
set.seed(123)
final_clusters_kmeans <- kmeans(scaled_rent, centers = 2, nstart = 20)
final_clusters_kmeans

## K-means clustering with 2 clusters of sizes 12, 8
## 
## Cluster means:
##         Rent Distance.from.Airport Distance.to...Downtown
## 1 -0.7383714             0.7244067             -0.4913949
## 2  1.1075571            -1.0866100              0.7370923
##   Distance.to.University
## 1               0.674371
## 2              -1.011557
## 
## Clustering vector:
##  [1] 1 1 1 1 1 2 1 2 1 1 2 2 2 1 1 1 2 2 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 18.373988  4.639936
##  (between_SS / total_SS =  69.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Median of every (unscaled) variable within each k-means cluster.
# The grouping list is named so the output column is labelled `cluster`,
# matching the hierarchical-clustering summary, instead of the default
# `Group.1`.
cluster1 <- final_clusters_kmeans$cluster
aggregate(rent, by = list(cluster = cluster1), FUN = median)

ActivityU7-Victor Omondi

Victor Omondi

2022-05-22