The dataset was taken from the Kaggle website. It contains 342 hostels from five big Japanese cities: Fukuoka City, Hiroshima, Kyoto, Osaka, and Tokyo, described by 16 variables.
This paper shows how to refine the dataset for exploration and summary by employing several dimension-reduction and clustering methods: Multidimensional Scaling (MDS), Principal Component Analysis (PCA), and hierarchical clustering. We focus on the most important features of a hostel, namely the eight rating categories: price from (minimum price), atmosphere, cleanliness, facilities, location, security, staff, and value for money.
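The code that builds df is not shown; a minimal sketch of the likely preprocessing is given below. The file name Hostel.csv and the use of na.omit are assumptions (dropping rows with missing ratings would explain why only 327 of the 342 hostels remain); the seven column names are the ones that appear in the tibble printed next.
# hypothetical preprocessing: file name and NA handling are assumptions
hostel_raw <- read.csv("Hostel.csv")
keep <- c("price.from", "atmosphere", "cleanliness", "facilities",
          "security", "staff", "valueformoney")
df <- na.omit(hostel_raw[, keep])   # keep only the rating/price columns, drop incomplete rows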
as_tibble(df)
## # A tibble: 327 x 7
## price.from atmosphere cleanliness facilities security staff
## * <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3300 8.9 9.4 9.3 9 9.4
## 2 2600 9.4 9.7 9.5 9.2 9.7
## 3 3600 8 7 9 10 10
## 4 2600 8 7.5 7.5 7 8
## 5 1500 9.5 9.5 9 9.5 10
## 6 2100 5.5 8 6 8.5 8.5
## 7 3300 8.7 9.7 9.3 9.3 9.7
## 8 2000 8.8 9.9 9.2 9.8 9.8
## 9 2200 6.7 7.2 6.8 7.8 8.5
## 10 1600 9.5 9.1 8.7 8.9 9.8
## # ... with 317 more rows, and 1 more variable: valueformoney <dbl>
dim(df)
## [1] 327 7
As a rule of thumb, reducing the dataset to two dimensions is preferable for plotting and exploration.
dist_reg<-dist(df)
as.matrix(dist_reg)[1:10, 1:10]
## 1 2 3 4 5 6
## 1 0.0000000 700.000371 300.0136 700.015736 1800.0003 1200.0141
## 2 700.0003714 0.000000 1000.0052 5.246904 1100.0002 500.0413
## 3 300.0136330 1000.005240 0.0000 1000.010875 2100.0021 1500.0090
## 4 700.0157355 5.246904 1000.0109 0.000000 1100.0126 500.0112
## 5 1800.0003000 1100.000218 2100.0021 1100.012614 0.0000 600.0329
## 6 1200.0141166 500.041278 1500.0090 500.011250 600.0329 0.0000
## 7 0.7483315 700.000643 300.0141 700.016078 1800.0003 1200.0133
## 8 1300.0004154 600.000717 1600.0029 600.024475 500.0008 100.1854
## 10 1100.0090318 400.032736 1400.0052 400.007150 700.0179 100.0288
## 11 1700.0002912 1000.000555 2000.0021 1000.011050 100.0032 500.0353
## 7 8 10 11
## 1 0.7483315 1300.0004 1100.0090 1700.0003
## 2 700.0006429 600.0007 400.0327 1000.0006
## 3 300.0140997 1600.0029 1400.0052 2000.0021
## 4 700.0160784 600.0245 400.0071 1000.0110
## 5 1800.0003500 500.0008 700.0179 100.0032
## 6 1200.0132749 100.1854 100.0288 500.0353
## 7 0.0000000 1300.0003 1100.0095 1700.0006
## 8 1300.0002615 0.0000 200.0628 400.0027
## 10 1100.0094681 200.0628 0.0000 600.0166
## 11 1700.0005559 400.0027 600.0166 0.0000
df_mds<-cmdscale(dist_reg, k=2)
summary(df_mds)
## V1 V2
## Min. : -7612 Min. :-14.3348
## 1st Qu.: -6612 1st Qu.: -0.6014
## Median : -6112 Median : 0.5771
## Mean : 0 Mean : 0.0000
## 3rd Qu.: -5712 3rd Qu.: 1.5831
## Max. :994588 Max. : 2.9898
cmdscale(dist_reg, k=2, eig = T)$GOF
## [1] 1 1
The goodness of fit is exactly 1, which is unusually high. Most likely the unscaled minimum price (in the thousands) dominates the Euclidean distances, while the rating columns only range from 0 to 10, so two dimensions are enough to reproduce essentially all of the distance information, and MDS on the raw distances may not be very informative.
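One way to check this assumption would be to repeat the MDS on standardized columns and compare the goodness of fit (a sketch only, reusing df from above):
df_mds_scaled <- cmdscale(dist(scale(df)), k=2, eig = T) # MDS on z-scored columns
df_mds_scaled$GOF # compare with the value above
Even so, the unscaled MDS map is still useful for spotting outliers: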
plot(df_mds)
library(maptools)
## Loading required package: sp
## Checking rgeos availability: TRUE
pointLabel(df_mds, rownames(df_mds), cex=0.8)
# there are some outliers here
abline(v=50000) # vertical line
abline(h=-8) # horizontal line
x_out<-which(df_mds[,1]>50000) # arbitrary limits
y_out<-which(df_mds[,2]<(-8))
out_all<-c(x_out, y_out) # merge outlier indices
out_uni<-unique(out_all) # unique observations only
df_out<-df[out_uni,] # rows flagged as outliers
df_out$x<-df_mds[out_uni,1]
df_out$y<-df_mds[out_uni,2]
points(df_out[,8:9], pch=21, bg="red")
We can see that there are a few outliers here; it is a good idea to remove them before further exploration.
df1 <- df[-out_uni,]
dim(df1)
## [1] 318 7
library('stats')
library(factoextra)
df_pca <- prcomp(df1, center=TRUE, scale.=TRUE)
summary(df_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.0229 1.0036 0.76916 0.64696 0.63851 0.50016
## Proportion of Variance 0.5846 0.1439 0.08452 0.05979 0.05824 0.03574
## Cumulative Proportion 0.5846 0.7285 0.81296 0.87276 0.93100 0.96674
## PC7
## Standard deviation 0.48255
## Proportion of Variance 0.03326
## Cumulative Proportion 1.00000
fviz_eig(df_pca, addlabels = TRUE, ylim = c(0, 70))
Four components explain about 87% of the variance; however, two components already account for 73% (which is still good) and are easier to describe and plot, so we will focus on the first two PCs only.
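The cumulative proportions used here can also be recomputed directly from the component standard deviations returned by prcomp:
prop_var <- df_pca$sdev^2 / sum(df_pca$sdev^2) # proportion of variance per PC
round(cumsum(prop_var), 3) # running total, matches the cumulative proportion row above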
var <- get_pca_var(df_pca)
head(var$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## price.from -0.1815930 -0.97383241 0.04186109 -0.02397869 0.1145704
## atmosphere -0.7760006 -0.01023835 -0.55171075 -0.07123492 0.1693585
## cleanliness -0.8417169 -0.09429280 0.13991439 -0.16880259 -0.3813442
## facilities -0.8687210 0.05314094 0.06118371 -0.25578759 -0.1721600
## security -0.7847779 0.07892450 0.47987706 0.19521889 0.2398977
## staff -0.8121262 -0.01767922 -0.17737963 0.50409353 -0.1744946
## Dim.6 Dim.7
## price.from 0.012456179 0.05536741
## atmosphere 0.018153730 -0.24340734
## cleanliness -0.275751704 -0.11440638
## facilities 0.365335758 0.10109317
## security 0.065076012 -0.21844066
## staff 0.009546789 0.15500043
fviz_pca_var(df_pca, col.var = "cos2",repel = TRUE, gradient.cols = c("blue", "yellow", "red"))
Except for minimum price, the variables seem to be positively correlated with one another.
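This impression can be checked directly on the cleaned data with a plain correlation matrix:
round(cor(df1), 2) # pairwise correlations of the seven variables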
fviz_cos2(df_pca, choice = "var", axes = 1:2)
Yet minimum price is the variable best represented on the first two PCs combined, and it contributes almost all of PC2 (see the contribution table below).
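Its quality of representation on the first two dimensions can also be summed directly from the cos2 matrix:
rowSums(var$cos2[, 1:2]) # cos2 on PC1 + PC2, per variable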
head(var$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## price.from 0.8058706 94.16239117 0.2962016 0.1373693 3.219637
## atmosphere 14.7160537 0.01040804 51.4504530 1.2123402 7.035193
## cleanliness 17.3140746 0.88280773 3.3089564 6.8076449 35.669414
## facilities 18.4428418 0.28039318 0.6327585 15.6313899 7.269867
## security 15.0508403 0.61849000 38.9247909 9.1050504 14.116080
## staff 16.1181157 0.03103380 5.3183117 60.7100932 7.468365
## Dim.6 Dim.7
## price.from 0.06202272 1.316507
## atmosphere 0.13173856 25.443813
## cleanliness 30.39607959 5.621023
## facilities 53.35382075 4.388929
## security 1.69286772 20.491881
## staff 0.03643307 10.317648
library("corrplot")
## corrplot 0.84 loaded
corrplot(var$cos2, is.corr=FALSE)
fviz_pca_ind(df_pca, col.ind = "cos2", pointsize = "cos2", label = FALSE,alpha.ind = 0.7,
gradient.cols = c("blue", "yellow", "red"),
repel = TRUE
)
The observations appear to be spread over a wide range along both PC1 and PC2.
After PCA, k-means clustering can be applied to the scores on the first two components:
col_pca <-c('PC1','PC2')
pca_data<- df_pca[["x"]][,col_pca]
head(pca_data)
## PC1 PC2
## 1 -0.93545418 -0.85302682
## 2 -1.52491566 -0.01949737
## 3 -0.06937586 -1.00672603
## 4 3.93089812 -0.72193440
## 5 -1.42391216 1.28501775
## 6 4.36580142 -0.12428257
Calculating the optimal number of clusters:
fviz_nbclust(pca_data, FUNcluster=kmeans)
K-means plot:
km2 <- eclust(pca_data,'kmeans', hc_metric = 'euclidean', k= 2)
From there, further inspection could be conducted; however, that is beyond the scope of this paper.
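A minimal sketch of what such an inspection might look like, using the cluster labels returned by eclust (illustrative only):
aggregate(df1, by = list(cluster = km2$cluster), FUN = mean) # mean price and ratings per cluster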
dist_reg<-dist(df1)
df_hc<-hclust(dist_reg, method="complete") # complete-linkage hierarchical clustering
plot(df_hc)
Hierarchical clustering does not project the data into a lower-dimensional space; instead, it seeks to build a hierarchy of clusters, so any number of clusters can be obtained by cutting the tree. In this case, let's try 2 clusters, since k-means after PCA gave a good result with k = 2.
hc <- df1
hc$clust <- cutree(df_hc, k=2)
summary(hc$clust)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.019 1.000 2.000
plot(df_hc)
rect.hclust(df_hc, k=2, border="red")
The quality of the hierarchical clustering can be checked with the ClustGeo library:
test <- cutree(df_hc, k=2)
library(ClustGeo)
inertion<-matrix(0, nrow=4, ncol=1)
rownames(inertion)<-c("intra-clust", "total", "percentage", "Q")
inertion[1,1]<-withindiss(dist_reg, part=test) # intra-cluster
inertion[2,1]<-inertdiss(dist_reg) # overall
inertion[3,1]<-inertion[1,1]/ inertion[2,1] # ratio
inertion[4,1]<-1-inertion[3,1] # Q, inter-cluster
inertion
## [,1]
## intra-clust 4.298107e+05
## total 6.776936e+05
## percentage 6.342258e-01
## Q 3.657742e-01
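The same ratio could be computed for other cuts of the tree to see how Q evolves with the number of clusters (a sketch reusing the objects above):
Q_k <- sapply(2:6, function(k)
  1 - withindiss(dist_reg, part = cutree(df_hc, k = k)) / inertdiss(dist_reg))
Q_k # inter-cluster share of inertia for k = 2..6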
Among the three methods, PCA gives the best result, with two components covering up to 73% of the variance. However, MDS is also useful for detecting outliers, and hierarchical clustering makes it easier to see which observations can be grouped together.