# Read the tab-delimited rent data file.
rent1 <- read.delim("C:/Users/Q/Downloads/Rent_Data.txt")
# Keep every column except the 1st and the 6th.
# NOTE(review): presumably these two columns are not usable as numeric
# clustering inputs -- confirm against the data file.
rent <- rent1[setdiff(seq_along(rent1), c(1, 6))]
# remove outliers
library(outliers)
# scores() discards NAs column by column, so restrict to complete rows first;
# otherwise the per-column results would no longer line up row by row.
rent_complete <- na.omit(rent)
# With type = "iqr" the score is the distance beyond Q1/Q3 in IQR units and
# `lim` must be a numeric cut-off. 1.5 is the conventional Tukey fence; the
# result is a logical flag per value (TRUE = outlier).
outlier_flags <- scores(rent_complete, type = "iqr", prob = NA, lim = 1.5)
outlier_flags
# Actually drop every observation that is flagged in at least one column.
rent <- rent_complete[rowSums(as.matrix(outlier_flags)) == 0, , drop = FALSE]
# Standardize every column (subtract the column mean, divide by the column
# standard deviation) so all variables are on a comparable scale.
scaled_rent <- scale(rent, center = TRUE, scale = TRUE)
# Per-column five-number summary and mean of the standardized values
summary(scaled_rent)
## Rent Distance.from.Airport Distance.to...Downtown
## Min. :-1.1486 Min. :-1.5853 Min. :-1.5624
## 1st Qu.:-0.7384 1st Qu.:-0.9554 1st Qu.:-0.8694
## Median :-0.4454 Median : 0.1995 Median :-0.1764
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.0197 3rd Qu.: 0.9869 3rd Qu.: 0.8316
## Max. : 1.6643 Max. : 1.3543 Max. : 1.4616
## Distance.to.University
## Min. :-1.4665
## 1st Qu.:-0.6637
## Median : 0.1392
## Mean : 0.0000
## 3rd Qu.: 0.5138
## Max. : 2.1730
# Pairwise Euclidean distances between the standardized observations
dist_among_observed <- dist(x = scaled_rent, method = "euclidean")
# Tabulate the distinct distance values and show the first six entries
head(table(dist_among_observed))
## dist_among_observed
## 0.47688569332196 0.526433710186056 0.610904547152077 0.610904547152078
## 1 1 1 1
## 0.629392393721958 0.661318394049812
## 1 1
# Agglomerative clustering of the observations with average linkage on the
# Euclidean distances.
cluster_fit_average <- hclust(dist_among_observed, method = "average")
# Centroid linkage is meant to be used with *squared* Euclidean distances
# (see the hclust() documentation and its centroid example), so the distances
# are squared before fitting.
cluster_fit_centroid <- hclust(dist_among_observed^2, method = "centroid")
library(NbClust)
# Evaluate candidate cluster counts from 2 to 5 for average-linkage
# clustering on Euclidean distances.
number_of_clusters <- NbClust(
  data = scaled_rent,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 5,
  method = "average"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 9 proposed 2 as the best number of clusters
## * 10 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
# Dendrogram of the centroid-linkage fit; leaves are labelled with the
# unscaled Rent value of each observation.
plot(cluster_fit_centroid, labels = rent$Rent)
# Draw rectangles around the three clusters on the dendrogram
rect.hclust(tree = cluster_fit_centroid, k = 3)

# Cut the average-linkage tree into three groups; one label per observation
final_clusters <- cutree(tree = cluster_fit_average, k = 3)
final_clusters
## [1] 1 1 1 1 1 2 1 2 1 1 2 2 2 3 3 1 2 2 2 1
# Cut the centroid-linkage tree into three groups; one label per observation
final_clusters1 <- cutree(tree = cluster_fit_centroid, k = 3)
final_clusters1
## [1] 1 1 1 1 1 2 1 2 1 1 2 2 2 3 3 1 2 2 2 1
# Show the unscaled data
rent
# Median of every variable within each average-linkage cluster
aggregate(x = rent, by = list(cluster = final_clusters), FUN = median)
# Evaluate candidate cluster counts from 2 to 5 for k-means clustering
optimal_clusters <- NbClust(
  data = scaled_rent,
  min.nc = 2,
  max.nc = 5,
  method = "kmeans"
)
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 10 proposed 2 as the best number of clusters
## * 10 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# Print the full NbClust result: the value of every index per cluster count,
# the critical values, the best cluster count per index, and the best partition.
optimal_clusters
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman
## 2 6.5164 41.4423 9.7260 0.9730 44.4857 1226.0515 58.1023 23.0139 20.6982
## 3 1.7392 34.7371 6.0442 0.8702 67.9151 854.9249 22.5630 14.9409 24.4803
## 4 2.6774 31.4414 3.1314 0.5742 87.1351 581.3630 10.4622 11.0221 31.3540
## 5 0.6694 27.1677 3.3347 -0.0470 107.2006 333.0823 7.4508 9.2180 48.4969
## Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky Ball
## 2 3.3023 0.3419 0.6017 0.5742 0.5606 7.8372 1.7201 0.5839 11.5070
## 3 5.0867 0.3262 0.6350 0.5269 0.4120 8.5646 3.0632 0.5172 4.9803
## 4 6.8953 0.4570 0.8674 0.4332 9.0009 -8.8890 0.0000 0.4622 2.7555
## 5 8.2447 0.4690 0.8263 0.3777 0.6387 3.3946 1.1708 0.4191 1.8436
## Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 0.8070 0.7603 0.4373 0.3601 0.0178 1.2370 0.9747 0.3275
## 3 0.8103 2.0500 0.5925 0.4370 0.0182 1.1999 0.8009 0.1951
## 4 0.6770 2.3866 1.0277 0.3560 0.0193 1.9718 0.7107 0.1682
## 5 0.6201 1.7215 1.2875 0.3801 0.0193 2.2134 0.6487 0.1414
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.2576 28.8239 0.1646
## 3 0.2019 23.7173 0.0303
## 4 -0.5879 -27.0105 NaN
## 5 0.1265 41.4361 0.3485
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## Number_clusters 2.0000 2.0000 3.0000 2.000 3.0000 3.0000 3.0000 3.0000
## Value_Index 6.5164 41.4423 3.6818 0.973 23.4294 97.5647 35.5394 4.1543
## Friedman Rubin Cindex DB Silhouette Duda PseudoT2
## Number_clusters 5.0000 4.0000 3.0000 2.0000 2.0000 2.0000 2.0000
## Value_Index 17.1428 -0.4591 0.3262 0.6017 0.5742 0.5606 7.8372
## Beale Ratkowsky Ball PtBiserial Frey McClain Dunn Hubert
## Number_clusters 2.0000 2.0000 3.0000 3.0000 1 2.0000 3.000 0
## Value_Index 1.7201 0.5839 6.5267 0.8103 NA 0.4373 0.437 0
## SDindex Dindex SDbw
## Number_clusters 3.0000 0 5.0000
## Value_Index 1.1999 0 0.1414
##
## $Best.partition
## [1] 1 1 1 1 1 2 1 2 1 1 2 2 2 1 1 1 2 2 2 1
# kmeans() starts from randomly chosen centers, so fix the seed to make the
# partition and the cluster numbering repeatable from run to run.
set.seed(123)
# Two clusters; nstart = 20 tries 20 random starts and keeps the best fit.
final_clusters_kmeans <- kmeans(scaled_rent, centers = 2, nstart = 20)
final_clusters_kmeans
## K-means clustering with 2 clusters of sizes 12, 8
##
## Cluster means:
## Rent Distance.from.Airport Distance.to...Downtown
## 1 -0.7383714 0.7244067 -0.4913949
## 2 1.1075571 -1.0866100 0.7370923
## Distance.to.University
## 1 0.674371
## 2 -1.011557
##
## Clustering vector:
## [1] 1 1 1 1 1 2 1 2 1 1 2 2 2 1 1 1 2 2 2 1
##
## Within cluster sum of squares by cluster:
## [1] 18.373988 4.639936
## (between_SS / total_SS = 69.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Cluster label that k-means assigned to each observation
cluster1 <- final_clusters_kmeans$cluster
# Median of every unscaled variable within each k-means cluster. Naming the
# grouping element yields a `cluster` column instead of the default `Group.1`,
# consistent with the hierarchical-clustering summary.
aggregate(rent, by = list(cluster = cluster1), FUN = median)