The dataset was taken from the Kaggle website. It contains 342 hostels from five big Japanese cities: Fukuoka City, Hiroshima, Kyoto, Osaka, and Tokyo, described by 16 variables.
This paper shows how to refine the dataset for exploration and summary by employing several dimension-reduction and clustering methods: Multidimensional Scaling (MDS), Principal Component Analysis (PCA), and hierarchical clustering. We focus on the most important features of a hostel, namely the eight rating categories: price from (minimum price), atmosphere, cleanliness, facilities, location, security, staff, and value for money.
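The code that builds df is not shown; a minimal sketch of the likely preprocessing is given below. The file name Hostel.csv and the use of na.omit are assumptions (dropping rows with missing ratings would explain why only 327 of the 342 hostels remain); the seven column names are the ones that appear in the tibble printed next.
# hypothetical preprocessing: file name and NA handling are assumptions
hostel_raw <- read.csv("Hostel.csv")
keep <- c("price.from", "atmosphere", "cleanliness", "facilities",
          "security", "staff", "valueformoney")
df <- na.omit(hostel_raw[, keep])   # keep only the rating/price columns, drop incomplete rows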
as_tibble(df)
## # A tibble: 327 x 7
## price.from atmosphere cleanliness facilities security staff
## * <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3300 8.9 9.4 9.3 9 9.4
## 2 2600 9.4 9.7 9.5 9.2 9.7
## 3 3600 8 7 9 10 10
## 4 2600 8 7.5 7.5 7 8
## 5 1500 9.5 9.5 9 9.5 10
## 6 2100 5.5 8 6 8.5 8.5
## 7 3300 8.7 9.7 9.3 9.3 9.7
## 8 2000 8.8 9.9 9.2 9.8 9.8
## 9 2200 6.7 7.2 6.8 7.8 8.5
## 10 1600 9.5 9.1 8.7 8.9 9.8
## # ... with 317 more rows, and 1 more variable: valueformoney <dbl>
dim(df)
## [1] 327 7
As a rule of thumb, reducing the dataset to two dimensions is preferable for plotting and exploration.
dist_reg<-dist(df)
as.matrix(dist_reg)[1:10, 1:10]
## 1 2 3 4 5 6
## 1 0.0000000 700.000371 300.0136 700.015736 1800.0003 1200.0141
## 2 700.0003714 0.000000 1000.0052 5.246904 1100.0002 500.0413
## 3 300.0136330 1000.005240 0.0000 1000.010875 2100.0021 1500.0090
## 4 700.0157355 5.246904 1000.0109 0.000000 1100.0126 500.0112
## 5 1800.0003000 1100.000218 2100.0021 1100.012614 0.0000 600.0329
## 6 1200.0141166 500.041278 1500.0090 500.011250 600.0329 0.0000
## 7 0.7483315 700.000643 300.0141 700.016078 1800.0003 1200.0133
## 8 1300.0004154 600.000717 1600.0029 600.024475 500.0008 100.1854
## 10 1100.0090318 400.032736 1400.0052 400.007150 700.0179 100.0288
## 11 1700.0002912 1000.000555 2000.0021 1000.011050 100.0032 500.0353
## 7 8 10 11
## 1 0.7483315 1300.0004 1100.0090 1700.0003
## 2 700.0006429 600.0007 400.0327 1000.0006
## 3 300.0140997 1600.0029 1400.0052 2000.0021
## 4 700.0160784 600.0245 400.0071 1000.0110
## 5 1800.0003500 500.0008 700.0179 100.0032
## 6 1200.0132749 100.1854 100.0288 500.0353
## 7 0.0000000 1300.0003 1100.0095 1700.0006
## 8 1300.0002615 0.0000 200.0628 400.0027
## 10 1100.0094681 200.0628 0.0000 600.0166
## 11 1700.0005559 400.0027 600.0166 0.0000
df_mds<-cmdscale(dist_reg, k=2)
summary(df_mds)
## V1 V2
## Min. : -7612 Min. :-14.3348
## 1st Qu.: -6612 1st Qu.: -0.6014
## Median : -6112 Median : 0.5771
## Mean : 0 Mean : 0.0000
## 3rd Qu.: -5712 3rd Qu.: 1.5831
## Max. :994588 Max. : 2.9898
cmdscale(dist_reg, k=2, eig = T)$GOF
## [1] 1 1
The goodness of fit is exactly 1, which is unusually high. Most likely the unscaled minimum price (in the thousands) dominates the Euclidean distances, while the rating columns only range from 0 to 10, so two dimensions are enough to reproduce essentially all of the distance information, and MDS on the raw distances may not be very informative.
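One way to check this assumption would be to repeat the MDS on standardized columns and compare the goodness of fit (a sketch only, reusing df from above):
df_mds_scaled <- cmdscale(dist(scale(df)), k=2, eig = T) # MDS on z-scored columns
df_mds_scaled$GOF # compare with the value above
Even so, the unscaled MDS map is still useful for spotting outliers: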
plot(df_mds)
library(maptools)
## Loading required package: sp
## Checking rgeos availability: TRUE
pointLabel(df_mds, rownames(df_mds), cex=0.8)
# there are some outliers here
abline(v=50000) # vertical line
abline(h=-8) # horizontal line
x_out<-which(df_mds[,1]>50000) # arbitrary limits
y_out<-which(df_mds[,2]<(-8))
out_all<-c(x_out, y_out) # merge outlier indices
out_uni<-unique(out_all) # unique observations only
df_out<-df[out_uni,] # rows flagged as outliers
df_out$x<-df_mds[out_uni,1]
df_out$y<-df_mds[out_uni,2]
points(df_out[,8:9], pch=21, bg="red")
We can see that there are a few outliers here; it is a good idea to remove them before further exploration.
df1 <- df[-out_uni,]
dim(df1)
## [1] 318 7
library('stats')
library(factoextra)
df_pca <- prcomp(df1, center=TRUE, scale.=TRUE)
summary(df_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.0229 1.0036 0.76916 0.64696 0.63851 0.50016
## Proportion of Variance 0.5846 0.1439 0.08452 0.05979 0.05824 0.03574
## Cumulative Proportion 0.5846 0.7285 0.81296 0.87276 0.93100 0.96674
## PC7
## Standard deviation 0.48255
## Proportion of Variance 0.03326
## Cumulative Proportion 1.00000
fviz_eig(df_pca, addlabels = TRUE, ylim = c(0, 70))
Four components explain about 87% of the variance; however, two components already account for 73% (which is still good) and are easier to describe and plot, so we will focus on the first two PCs only.
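The cumulative proportions used here can also be recomputed directly from the component standard deviations returned by prcomp:
prop_var <- df_pca$sdev^2 / sum(df_pca$sdev^2) # proportion of variance per PC
round(cumsum(prop_var), 3) # running total, matches the cumulative proportion row above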
var <- get_pca_var(df_pca)
head(var$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## price.from -0.1815930 -0.97383241 0.04186109 -0.02397869 0.1145704
## atmosphere -0.7760006 -0.01023835 -0.55171075 -0.07123492 0.1693585
## cleanliness -0.8417169 -0.09429280 0.13991439 -0.16880259 -0.3813442
## facilities -0.8687210 0.05314094 0.06118371 -0.25578759 -0.1721600
## security -0.7847779 0.07892450 0.47987706 0.19521889 0.2398977
## staff -0.8121262 -0.01767922 -0.17737963 0.50409353 -0.1744946
## Dim.6 Dim.7
## price.from 0.012456179 0.05536741
## atmosphere 0.018153730 -0.24340734
## cleanliness -0.275751704 -0.11440638
## facilities 0.365335758 0.10109317
## security 0.065076012 -0.21844066
## staff 0.009546789 0.15500043
fviz_pca_var(df_pca, col.var = "cos2",repel = TRUE, gradient.cols = c("blue", "yellow", "red"))
Except for minimum price, the variables seem to be positively correlated with one another.
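This impression can be checked directly on the cleaned data with a plain correlation matrix:
round(cor(df1), 2) # pairwise correlations of the seven variables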
fviz_cos2(df_pca, choice = "var", axes = 1:2)
Yet minimum price is the variable best represented on the first two PCs combined, and it contributes almost all of PC2 (see the contribution table below).
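Its quality of representation on the first two dimensions can also be summed directly from the cos2 matrix:
rowSums(var$cos2[, 1:2]) # cos2 on PC1 + PC2, per variable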
head(var$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## price.from 0.8058706 94.16239117 0.2962016 0.1373693 3.219637
## atmosphere 14.7160537 0.01040804 51.4504530 1.2123402 7.035193
## cleanliness 17.3140746 0.88280773 3.3089564 6.8076449 35.669414
## facilities 18.4428418 0.28039318 0.6327585 15.6313899 7.269867
## security 15.0508403 0.61849000 38.9247909 9.1050504 14.116080
## staff 16.1181157 0.03103380 5.3183117 60.7100932 7.468365
## Dim.6 Dim.7
## price.from 0.06202272 1.316507
## atmosphere 0.13173856 25.443813
## cleanliness 30.39607959 5.621023
## facilities 53.35382075 4.388929
## security 1.69286772 20.491881
## staff 0.03643307 10.317648
library("corrplot")
## corrplot 0.84 loaded
corrplot(var$cos2, is.corr=FALSE)
fviz_pca_ind(df_pca, col.ind = "cos2", pointsize = "cos2", label = FALSE,alpha.ind = 0.7,
gradient.cols = c("blue", "yellow", "red"),
repel = TRUE
)
The observations appear to be spread over a wide range along both PC1 and PC2.
After PCA, k-means clustering can be applied to the scores on the first two components:
col_pca <-c('PC1','PC2')
pca_data<- df_pca[["x"]][,col_pca]
head(pca_data)
## PC1 PC2
## 1 -0.93545418 -0.85302682
## 2 -1.52491566 -0.01949737
## 3 -0.06937586 -1.00672603
## 4 3.93089812 -0.72193440
## 5 -1.42391216 1.28501775
## 6 4.36580142 -0.12428257
Calculating the optimal number of clusters:
fviz_nbclust(pca_data, FUNcluster=kmeans)
K-means plot:
km2 <- eclust(pca_data,'kmeans', hc_metric = 'euclidean', k= 2)
From there, further inspection could be conducted; however, that is beyond the scope of this paper.
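A minimal sketch of what such an inspection might look like, using the cluster labels returned by eclust (illustrative only):
aggregate(df1, by = list(cluster = km2$cluster), FUN = mean) # mean price and ratings per cluster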
dist_reg<-dist(df1)
df_hc<-hclust(dist_reg, method="complete") # complete-linkage hierarchical clustering
plot(df_hc)
Hierarchical clustering does not project the data into a lower-dimensional space; instead, it seeks to build a hierarchy of clusters, so any number of clusters can be obtained by cutting the tree. In this case, let's try 2 clusters, since k-means after PCA gave a good result with k = 2.
hc <- df1
hc$clust <- cutree(df_hc, k=2)
summary(hc$clust)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.019 1.000 2.000
plot(df_hc)
rect.hclust(df_hc, k=2, border="red")
The quality of the hierarchical clustering can be checked with the ClustGeo library:
test <- cutree(df_hc, k=2)
library(ClustGeo)
inertion<-matrix(0, nrow=4, ncol=1)
rownames(inertion)<-c("intra-clust", "total", "percentage", "Q")
inertion[1,1]<-withindiss(dist_reg, part=test) # intra-cluster
inertion[2,1]<-inertdiss(dist_reg) # overall
inertion[3,1]<-inertion[1,1]/ inertion[2,1] # ratio
inertion[4,1]<-1-inertion[3,1] # Q, inter-cluster
inertion
## [,1]
## intra-clust 4.298107e+05
## total 6.776936e+05
## percentage 6.342258e-01
## Q 3.657742e-01
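The same ratio could be computed for other cuts of the tree to see how Q evolves with the number of clusters (a sketch reusing the objects above):
Q_k <- sapply(2:6, function(k)
  1 - withindiss(dist_reg, part = cutree(df_hc, k = k)) / inertdiss(dist_reg))
Q_k # inter-cluster share of inertia for k = 2..6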
Among the three methods, PCA gives the best result, with two components covering up to 73% of the variance. However, MDS is also useful for detecting outliers, and hierarchical clustering makes it easier to see which observations can be grouped together.