mydata_full <- read.table("C:/Julia/SEB LU/IMB/SEMESTER 2/Multivariate Analysis/HOMEWORK/Real estate.csv", header=TRUE, sep=",", dec=".")

mydata <- mydata_full[c(-2)] #Dropping the 2nd column, which is not used in the analysis

# Renaming columns
colnames(mydata) <- c("ID", "House_age", "MRT_distance", "No_stores", "Latitude", "Longitude", "House_price")

head(mydata)
##   ID House_age MRT_distance No_stores Latitude Longitude House_price
## 1  1      32.0     84.87882        10 24.98298  121.5402        37.9
## 2  2      19.5    306.59470         9 24.98034  121.5395        42.2
## 3  3      13.3    561.98450         5 24.98746  121.5439        47.3
## 4  4      13.3    561.98450         5 24.98746  121.5439        54.8
## 5  5       5.0    390.56840         5 24.97937  121.5425        43.1
## 6  6       7.1   2175.03000         3 24.96305  121.5125        32.1

Description of dataset:

The dataset was found on kaggle.com and concerns the prices of houses on the real estate market. The unit of observation is a house, and the dataset contains 414 observations.
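
As a quick check of the stated size (a small sketch of my own), the dimensions can be confirmed directly:

dim(mydata) #expected: 414 rows (houses) and 7 columns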

Description of variables:

#Standardising variables, due to different variances between variables:
mydata$House_age_z <- scale(mydata$House_age)
mydata$MRT_distance_z   <- scale(mydata$MRT_distance)
mydata$No_stores_z <- scale(mydata$No_stores)
mydata$Latitude_z <- scale(mydata$Latitude)
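
#Note (my own sketch): scale() standardises as z = (x - mean(x)) / sd(x); a quick manual check for one variable:
House_age_manual <- (mydata$House_age - mean(mydata$House_age)) / sd(mydata$House_age)
all.equal(as.numeric(mydata$House_age_z), House_age_manual) #should return TRUE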
library(Hmisc)
rcorr(as.matrix(mydata[, c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z")]), 
      type = "pearson")
##                House_age_z MRT_distance_z No_stores_z Latitude_z
## House_age_z           1.00           0.03        0.05       0.05
## MRT_distance_z        0.03           1.00       -0.60      -0.59
## No_stores_z           0.05          -0.60        1.00       0.44
## Latitude_z            0.05          -0.59        0.44       1.00
## 
## n= 414 
## 
## 
## P
##                House_age_z MRT_distance_z No_stores_z Latitude_z
## House_age_z                0.6032         0.3141      0.2693    
## MRT_distance_z 0.6032                     0.0000      0.0000    
## No_stores_z    0.3141      0.0000                     0.0000    
## Latitude_z     0.2693      0.0000         0.0000
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
scatterplot(mydata$MRT_distance, mydata$House_age)

mydata$Dissimilarity <- sqrt(mydata$House_age_z^2 + mydata$MRT_distance_z^2 + mydata$No_stores_z^2 + mydata$Latitude_z^2) #Finding outliers: the Euclidean distance of each unit from the overall centroid; there is no need to subtract the means before squaring because the variables are standardised, so their means are 0.
head(mydata[order(-mydata$Dissimilarity), ], 10) #The 10 units with the highest dissimilarity; the minus sign sorts in descending order (highest to lowest).
##      ID House_age MRT_distance No_stores Latitude Longitude House_price House_age_z MRT_distance_z No_stores_z
## 117 117      30.9     6396.283         1 24.94375  121.4788        12.2  1.15755607       4.209141   -1.050463
## 36   36      13.9     4079.418         0 25.01459  121.5182        27.3 -0.33465574       2.373433   -1.389957
## 348 348      17.4     6488.021         1 24.95719  121.4735        11.2 -0.02743566       4.281827   -1.050463
## 250 250      18.0     6306.153         1 24.95743  121.4752        15.0  0.02523063       4.137729   -1.050463
## 9     9      31.7     5512.038         1 24.95095  121.4846        18.8  1.22777780       3.508532   -1.050463
## 256 256      31.5     5512.038         1 24.95095  121.4846        17.4  1.21022237       3.508532   -1.050463
## 149 149      16.4     3780.590         0 24.93293  121.5120        45.1 -0.11521283       2.136664   -1.389957
## 195 195      15.2     3771.895         0 24.93363  121.5116        29.3 -0.22054543       2.129775   -1.389957
## 383 383      16.3     3529.564         0 24.93207  121.5160        29.3 -0.12399055       1.937770   -1.389957
## 321 321      13.5     4197.349         0 24.93885  121.5038        18.6 -0.36976661       2.466872   -1.389957
##     Latitude_z Dissimilarity
## 117 -2.0370405      4.930498
## 36   3.6711689      4.599417
## 348 -0.9540600      4.510931
## 250 -0.9347211      4.370196
## 9   -1.4568724      4.128339
## 256 -1.4568724      4.123152
## 149 -2.9089042      3.869407
## 195 -2.8524989      3.827964
## 383 -2.9782020      3.817328
## 321 -2.4318771      3.750759
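
To back up the comment above (a sketch of my own): after standardisation the column means are 0, so the distance to the overall centroid reduces to the distance to the origin.

round(colMeans(mydata[c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z")]), 10) #all means are (numerically) zero
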
#mydata <- mydata[-117, ] #If we wanted to remove this unit (the outlier); I will keep it for now, as it may not be problematic.
library(factoextra)

#Calculating Euclidean distances
distance <- get_dist(mydata[c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z")], 
                     method = "euclidian") #method: what type of distance you will use

distance2 <- distance^2

fviz_dist(distance2) #Showing dissimilarity matrix

The dissimilarity matrix does not look great, but I will formally test the clusterability using the Hopkins statistic.

get_clust_tendency(mydata[c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z")], #Hopkins statistics
                   n = nrow(mydata) - 1, 
                   graph = FALSE)
## $hopkins_stat
## [1] 0.9022894
## 
## $plot
## NULL

=> Good: the Hopkins statistic (0.90) is well above 0.5, so the data is clusterable.

Hierarchical clustering:

library(dplyr) #Allowing use of %>% (pipes)

WARD <- mydata[c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z")] %>%
  #scale() %>% -> not needed here because the variables were already standardised above, but useful to know if we wanted to do everything in one step
  get_dist(method = "euclidean") %>%  #Selecting distance
  hclust(method = "ward.D2") #Selecting algorithm; ward automatically squares the distances; (distance used is SQUARED EUCLIDEAN - ward.D2)

WARD
## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 414
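
To illustrate the comment about squared distances (a hedged sketch of my own): ward.D2 on Euclidean distances should produce the same tree topology as ward.D applied to squared Euclidean distances; only the merge heights differ.

d <- get_dist(mydata[c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z")], method = "euclidean")
WARD_check <- hclust(d^2, method = "ward.D") #ward.D on squared distances
table(cutree(WARD, k = 4), cutree(WARD_check, k = 4)) #the two 4-cluster partitions should agree (up to label order)
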
library(factoextra)
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.

From the dendrogram above, I would go for 4 clusters.
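
To make this choice less dependent on eyeballing the dendrogram (a small sketch of my own), we can inspect the largest merge heights; a clear drop between the 3rd and 4th largest values would support cutting the tree into 4 clusters.

round(sort(WARD$height, decreasing = TRUE)[1:8], 2) #look for a large drop between the 3rd and 4th values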

If we colour the four clusters on the dendrogram, it looks like this:

fviz_dend(WARD, 
          k = 4,
          cex = 0.5, 
          palette = "jama",
          color_labels_by_k = TRUE, 
          rect = TRUE)

From the dendrogram we can also see that no unit is clearly problematic, so the potential outlier 117 can stay in the dataset.

We can also check the appropriate number of clusters with the following method:

set.seed(1)
#install.packages("NbClust")
library(NbClust)
OptNumber <- mydata[c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z")] %>%
  #scale() %>%
  NbClust(distance = "euclidean",
          min.nc = 2, max.nc = 10, 
          method = "ward.D2", 
          index = "all") 

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 1 proposed 2 as the best number of clusters 
## * 2 proposed 3 as the best number of clusters 
## * 11 proposed 4 as the best number of clusters 
## * 3 proposed 5 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 5 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  4 
##  
##  
## *******************************************************************

According to this method, 11 indices proposed 4 as the best number of clusters, which matches the choice I made from the dendrogram.

Now I must cut the dendrogram:

mydata$ClusterWard <- cutree(WARD, 
                             k = 4) #k=4, because we are making 4 clusters
head(mydata)
##   ID House_age MRT_distance No_stores Latitude Longitude House_price House_age_z MRT_distance_z No_stores_z Latitude_z
## 1  1      32.0     84.87882        10 24.98298  121.5402        37.9   1.2541110     -0.7915373   2.0049816  1.1240698
## 2  2      19.5    306.59470         9 24.98034  121.5395        42.2   0.1568964     -0.6158665   1.6654877  0.9113415
## 3  3      13.3    561.98450         5 24.98746  121.5439        47.3  -0.3873220     -0.4135150   0.3075125  1.4850633
## 4  4      13.3    561.98450         5 24.98746  121.5439        54.8  -0.3873220     -0.4135150   0.3075125  1.4850633
## 5  5       5.0    390.56840         5 24.97937  121.5425        43.1  -1.1158725     -0.5493321   0.3075125  0.8331800
## 6  6       7.1   2175.03000         3 24.96305  121.5125        32.1  -0.9315405      0.8645401  -0.3714751 -0.4818677
##   Dissimilarity ClusterWard
## 1      2.735472           1
## 2      2.002074           2
## 3      1.618947           2
## 4      1.618947           2
## 5      1.528296           2
## 6      1.409038           3
#Calculating positions of initial leaders.

Initial_leaders <- aggregate(mydata[, c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z")], 
                            by = list(mydata$ClusterWard), 
                            FUN = mean)

Initial_leaders
##   Group.1 House_age_z MRT_distance_z No_stores_z Latitude_z
## 1       1   1.4303968     -0.4908891   0.5444509  0.4435726
## 2       2  -0.6539170     -0.5691685   0.6709450  0.3734360
## 3       3  -0.3384873      0.3201352  -0.8726326 -0.2387496
## 4       4   0.2039547      2.6549612  -1.3050831 -1.9654595

A “-” in front of a cluster mean indicates that it is below the average of the entire sample.

Next, I will perform K-Means clustering, where the initial leaders are the centroids of the groups found with hierarchical clustering:

library(factoextra)

K_MEANS <- hkmeans(mydata[c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z")], 
                   k = 4, 
                   hc.metric = "euclidean",
                   hc.method = "ward.D2")

K_MEANS
## Hierarchical K-means clustering with 4 clusters of sizes 100, 147, 129, 38
## 
## Cluster means:
##   House_age_z MRT_distance_z No_stores_z Latitude_z
## 1   1.3984166     -0.4783617   0.5179987  0.4714694
## 2  -0.7046932     -0.5734776   0.7024339  0.4213341
## 3  -0.3268987      0.2588928  -0.8162383 -0.2791024
## 4   0.1557414      2.5984266  -1.3095501 -1.9231273
## 
## Clustering vector:
##   [1] 1 2 2 2 2 3 1 2 4 3 1 2 2 3 2 1 2 3 2 2 3 2 3 2 1 1 2 2 2 2 4 1 1 2 2 3 3 3 2 2 4 4 1 1 2 1 2 1 4 4 1 3 1 2 2 3 1
##  [58] 2 4 2 3 2 3 2 3 1 2 2 1 2 2 1 1 4 2 3 1 3 1 3 2 1 2 3 2 2 3 4 3 4 3 3 3 3 1 2 2 1 2 2 2 3 2 2 1 2 2 3 1 3 2 1 3 2
## [115] 1 3 4 4 3 2 2 2 1 3 2 2 1 2 1 1 1 3 1 2 1 3 2 2 3 2 2 3 2 2 3 2 3 2 4 1 1 2 3 2 4 4 3 2 2 2 2 3 4 2 3 3 2 1 1 3 4
## [172] 2 2 1 2 1 4 1 2 3 4 2 3 4 3 1 3 4 1 4 1 3 1 2 4 3 3 1 1 2 3 2 1 2 3 3 2 1 3 1 2 3 3 2 3 2 1 1 2 1 1 3 1 3 1 2 4 1
## [229] 3 3 3 4 4 1 3 2 2 3 3 3 3 2 3 1 3 2 2 3 3 4 1 3 2 3 2 4 3 3 2 3 3 3 2 3 1 2 3 1 2 3 3 2 2 3 1 2 3 3 2 3 2 2 3 1 2
## [286] 1 2 3 2 2 1 2 3 2 1 3 2 1 4 1 2 1 3 1 3 2 3 4 2 3 3 1 1 2 2 3 2 3 2 1 4 2 3 1 3 1 2 2 3 4 3 4 1 2 1 1 3 1 1 2 1 3
## [343] 2 1 1 3 3 4 2 2 2 3 3 3 3 2 3 2 2 3 1 1 2 1 1 3 3 3 3 3 2 2 1 3 2 3 3 2 1 2 2 2 4 1 4 2 3 3 3 1 1 3 1 2 4 1 1 2 3
## [400] 3 1 3 3 1 2 1 2 3 3 4 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 139.40664 170.30184 236.62928  36.56635
##  (between_SS / total_SS =  64.7 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"        
##  [8] "iter"         "ifault"       "data"         "hclust"

This was done because hierarchical clustering is used to define the initial leaders, whereas K-Means clustering then reassigns units to clusters based on their distances to these leaders.
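
hkmeans() wraps both steps into a single call; a hedged sketch of the equivalent explicit version, seeding kmeans() with the Ward centroids computed above (Initial_leaders), should give essentially the same solution:

K_MEANS_manual <- kmeans(mydata[c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z")],
                         centers = Initial_leaders[, -1]) #drop the Group.1 column, keep the four centroid rows
table(K_MEANS$cluster, K_MEANS_manual$cluster) #compare the two partitions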

I also check the ratio of the between-group to the total sum of squares, which is 64.7% here and reasonably high. If it were 100%, all units within each cluster would be identical, so a higher ratio is better for the purpose of clustering.
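
This ratio can also be extracted directly from the result object (a small sketch):

K_MEANS$betweenss / K_MEANS$totss #should reproduce the 64.7 % reported above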

fviz_cluster(K_MEANS, 
             palette = "jama", 
             repel = TRUE, #to better see if units (values) overlap
             ggtheme = theme_classic())
## Warning: ggrepel: 333 unlabeled data points (too many overlaps). Consider increasing max.overlaps

The final leaders are the centers of the groups, and we represent them with the cluster means.

We can also see that the first dimension captures 52.4% of the variability and the second 25.1%, so some information is lost in this two-dimensional representation.
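
These percentages come from the principal component analysis that fviz_cluster() uses to project the four variables onto two dimensions; a hedged sketch of the underlying computation:

pca <- prcomp(mydata[c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z")])
summary(pca)$importance["Proportion of Variance", 1:2] #should be roughly 0.524 and 0.251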

mydata$ClusterK_Means <- K_MEANS$cluster #Saving the final result.
head(mydata[c("ID", "ClusterWard", "ClusterK_Means")])
##   ID ClusterWard ClusterK_Means
## 1  1           1              1
## 2  2           2              2
## 3  3           2              2
## 4  4           2              2
## 5  5           2              2
## 6  6           3              3

Here we can see whether any unit (house) was reassigned to a different group. (K-Means clustering always tries to further optimise the initial solution.)

Let us check this in more detail by cross-tabulating the two solutions and looking at the reclassifications:

table(mydata$ClusterWard)
## 
##   1   2   3   4 
##  96 156 126  36
table(mydata$ClusterK_Means)
## 
##   1   2   3   4 
## 100 147 129  38
table(mydata$ClusterWard, mydata$ClusterK_Means)
##    
##       1   2   3   4
##   1  94   0   2   0
##   2   4 147   5   0
##   3   2   0 122   2
##   4   0   0   0  36

In the Ward solution, 96 houses were in group 1, 156 in group 2, 126 in group 3 and 36 in group 4; after the K-Means optimisation some houses were reclassified.

Now 100 houses are in group 1: 94 that were originally there, 4 that came from group 2 and 2 from group 3.

147 are in group 2, with no additions from other groups.

129 are in group 3, with 122 originally in group 3, 2 reclassified from group 1 to group 3, and 5 reclassified from group 2 to group 3.

Finally, there are 38 in group 4, of which 36 were not reclassified (were originally also in group 4), and 2 which came from group 3.
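
The overall stability of the solution can be summarised in one number (a sketch of my own; since the cluster labels coincide here, the diagonal of the table counts the houses that kept their group):

sum(diag(table(mydata$ClusterWard, mydata$ClusterK_Means))) / nrow(mydata) #share of houses that were not reclassified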

Centroids <- K_MEANS$centers
Centroids
##   House_age_z MRT_distance_z No_stores_z Latitude_z
## 1   1.3984166     -0.4783617   0.5179987  0.4714694
## 2  -0.7046932     -0.5734776   0.7024339  0.4213341
## 3  -0.3268987      0.2588928  -0.8162383 -0.2791024
## 4   0.1557414      2.5984266  -1.3095501 -1.9231273
library(ggplot2)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
Figure <- as.data.frame(Centroids)
Figure$id <- 1:nrow(Figure)
Figure <- pivot_longer(Figure, cols = c(House_age_z, MRT_distance_z, No_stores_z, Latitude_z))

Figure$Groups <- factor(Figure$id, 
                        levels = c(1, 2, 3, 4), 
                        labels = c("1", "2", "3", "4")) #all four clusters; with only three levels, cluster 4 would become NA and be dropped from the plot

Figure$nameFactor <- factor(Figure$name, 
                            levels = c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z"), 
                            labels = c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z"))

ggplot(Figure, aes(x = nameFactor, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Groups, col = Groups), size = 3) +
  geom_line(aes(group = id), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables") +
  ylim(-2, 3) #wide enough to show all four centroid profiles (cluster 4 reaches about 2.6)

Now I check whether all of the clustering variables were successful at separating the units into groups. For this I perform ANOVAs: because there are 4 clustering variables, 4 ANOVAs are needed.

(cbind() combines the four variables into one response matrix, so all ANOVAs can be run in one step; otherwise the code would have to be written four times.)

fit <- aov(cbind(House_age_z, MRT_distance_z, No_stores_z, Latitude_z) ~ as.factor(ClusterK_Means), 
                 data = mydata)

summary(fit)
##  Response 1 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   3 283.26  94.421  298.39 < 2.2e-16 ***
## Residuals                 410 129.74   0.316                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 2 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   3 336.44 112.148  600.61 < 2.2e-16 ***
## Residuals                 410  76.56   0.187                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 3 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   3 250.48  83.492  210.63 < 2.2e-16 ***
## Residuals                 410 162.52   0.396                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 4 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   3 198.91  66.304  126.98 < 2.2e-16 ***
## Residuals                 410 214.09   0.522                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can compare the F statistics: a larger F value means the groups are further apart on that variable, so the variable with the highest F statistic separates the groups best (this comparison is possible because the data are standardised).

The variable which separates the groups most is MRT_distance, as it has the highest F statistic.

I also check the p-values, which are all low, so every variable is appropriate for the clustering. If a p-value were high, we could not reject H0, meaning that the variable does not separate the groups; such a variable would be useless and should be removed.
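
For completeness (a sketch of my own), the F statistics and p-values can also be extracted per variable and sorted, which makes the comparison explicit:

vars <- c("House_age_z", "MRT_distance_z", "No_stores_z", "Latitude_z")
F_table <- t(sapply(vars, function(v) {
  tab <- summary(aov(as.numeric(mydata[[v]]) ~ as.factor(mydata$ClusterK_Means)))[[1]]
  c(F_value = tab[["F value"]][1], p_value = tab[["Pr(>F)"]][1])
}))
F_table[order(-F_table[, "F_value"]), ] #MRT_distance_z should come out on top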

library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
describeBy(mydata$House_price, mydata$ClusterK_Means)
## 
##  Descriptive statistics by group 
## group: 1
##    vars   n  mean   sd median trimmed  mad  min  max range skew kurtosis   se
## X1    1 100 40.45 9.88  39.65    39.4 5.34 21.5 78.3  56.8 1.44     3.46 0.99
## ------------------------------------------------------------------------------------------ 
## group: 2
##    vars   n  mean   sd median trimmed  mad min  max range  skew kurtosis   se
## X1    1 147 47.68 9.21   47.3   47.56 9.19 7.6 73.6    66 -0.19      1.8 0.76
## ------------------------------------------------------------------------------------------ 
## group: 3
##    vars   n  mean   sd median trimmed  mad  min   max range skew kurtosis   se
## X1    1 129 30.69 11.7   28.4   29.38 7.12 12.8 117.5 104.7 3.43    21.52 1.03
## ------------------------------------------------------------------------------------------ 
## group: 4
##    vars  n  mean   sd median trimmed  mad  min  max range skew kurtosis   se
## X1    1 38 18.72 6.36   17.8   17.89 4.45 11.2 45.1  33.9 1.97      5.5 1.03
aggregate(mydata$House_price, 
          by = list(mydata$ClusterK_Means), 
          FUN = "mean")
##   Group.1        x
## 1       1 40.45500
## 2       2 47.67619
## 3       3 30.68682
## 4       4 18.71842

H0: arith. mean(House price, group 1) = arith. mean(House price, group 2) = arith. mean(House price, group 3) = arith. mean(House price, group 4)
H1: at least one is different

We are assessing criterion validity; the first criterion variable is house price, which was not used in the clustering.

fit <- aov(House_price ~ as.factor(ClusterK_Means), 
           data = mydata)

summary(fit)
##                            Df Sum Sq Mean Sq F value Pr(>F)    
## as.factor(ClusterK_Means)   3  35393   11798   117.8 <2e-16 ***
## Residuals                 410  41069     100                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Because the p-value is low (less than 0.001), we reject H0: the clusters differ in mean house price, which supports the criterion validity of the classification.

write.table(mydata, 
           file = "C:/Julia/SEB LU/IMB/SEMESTER 2/Multivariate Analysis/HOMEWORK/Real-estate-cluster.csv", 
           row.names = FALSE, 
           sep = ";", 
           dec = ",",
           fileEncoding = "UTF-8")

The classification of 414 houses was based on four standardised variables (house age, distance to the nearest MRT station, number of stores in proximity and latitude).

For the hierarchical clustering, Ward’s algorithm was used, and based on the dendrogram and the indices measuring the increase in heterogeneity, the houses were classified into four groups. The classification was then further optimised with K-Means clustering.

Group 1 contains the second-smallest number of houses (100 houses, about 24%). These houses do not have the lowest mean score on any of the four variables, and they have the highest mean scores on house age and latitude among all groups. On average, they are the second most expensive group, with a mean price of 40.46.