Packages Required:
library(psych)
library(readr)
library(prettydoc)
library(tidyverse)
library(FactoMineR)
library(car)
library(skimr)
library(stats)
library(purrr)
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8) # load dataset
download.file("https://www.dropbox.com/s/6yoi7hoewg6563n/Hospital.csv?dl=1", "hospital.csv")
hospital <- read_csv("hospital.csv")
df <- hospital # to simplify and minimize amount of changes to make to original codeData Prep to remove demographic variables
###Factor analysis step 2: Check correlation matrix
## reputation quality distance doctor friends
## reputation 1.0000000 -0.5642245 -0.128171212 0.8574469 -0.6640828
## quality -0.5642245 1.0000000 -0.205235351 -0.5724863 0.8389557
## distance -0.1281712 -0.2052354 1.000000000 -0.1281710 -0.2177883
## doctor 0.8574469 -0.5724863 -0.128171001 1.0000000 -0.6636600
## friends -0.6640828 0.8389557 -0.217788263 -0.6636600 1.0000000
## ads -0.1823649 -0.3287447 -0.036027958 -0.1876315 -0.2571390
## staff -0.6748534 0.8718534 -0.192615781 -0.6711862 0.9174313
## facilities 0.8576301 -0.5670271 -0.131866954 0.8434716 -0.6285909
## nursery 0.8337463 -0.5627049 -0.097004128 0.8395771 -0.6534683
## insurance -0.2142136 -0.3015886 -0.003761886 -0.2047735 -0.2413311
## familiar 0.7510432 -0.1803307 -0.308038301 0.7077666 -0.2568143
## convenience 0.7323836 -0.1794451 -0.332817095 0.7072520 -0.2540840
## fullservice -0.6697384 0.3296036 -0.192999920 -0.6617422 0.4275858
## qualcare -0.2777240 0.7503848 -0.387940545 -0.2622053 0.7484171
## ads staff facilities nursery insurance
## reputation -0.18236493 -0.6748534 0.8576301 0.83374630 -0.214213613
## quality -0.32874471 0.8718534 -0.5670271 -0.56270487 -0.301588552
## distance -0.03602796 -0.1926158 -0.1318670 -0.09700413 -0.003761886
## doctor -0.18763150 -0.6711862 0.8434716 0.83957713 -0.204773548
## friends -0.25713904 0.9174313 -0.6285909 -0.65346832 -0.241331110
## ads 1.00000000 -0.2790169 -0.1948477 -0.16296383 0.797335879
## staff -0.27901685 1.0000000 -0.6422828 -0.66544636 -0.279254861
## facilities -0.19484770 -0.6422828 1.0000000 0.81963063 -0.231850433
## nursery -0.16296383 -0.6654464 0.8196306 1.00000000 -0.190847002
## insurance 0.79733588 -0.2792549 -0.2318504 -0.19084700 1.000000000
## familiar -0.50867966 -0.2619830 0.7592179 0.69501094 -0.546273691
## convenience -0.49395653 -0.2445710 0.7375454 0.69562282 -0.513095934
## fullservice 0.56239074 0.4221076 -0.6677868 -0.64350090 0.553869069
## qualcare -0.52456448 0.7806046 -0.2370941 -0.27191100 -0.522785030
## familiar convenience fullservice qualcare
## reputation 0.7510432 0.7323836 -0.66973840 -0.27772397
## quality -0.1803307 -0.1794451 0.32960364 0.75038480
## distance -0.3080383 -0.3328171 -0.19299992 -0.38794054
## doctor 0.7077666 0.7072520 -0.66174222 -0.26220535
## friends -0.2568143 -0.2540840 0.42758583 0.74841713
## ads -0.5086797 -0.4939565 0.56239074 -0.52456448
## staff -0.2619830 -0.2445710 0.42210762 0.78060456
## facilities 0.7592179 0.7375454 -0.66778683 -0.23709408
## nursery 0.6950109 0.6956228 -0.64350090 -0.27191100
## insurance -0.5462737 -0.5130959 0.55386907 -0.52278503
## familiar 1.0000000 0.8776665 -0.69536120 0.20000669
## convenience 0.8776665 1.0000000 -0.65626098 0.19837522
## fullservice -0.6953612 -0.6562610 1.00000000 0.08092793
## qualcare 0.2000067 0.1983752 0.08092793 1.00000000
#Factor analysis step 3: Conduct Principal Components Analysis
##
## Call:
## PCA(X = df, scale.unit = TRUE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 6.977 3.896 1.405 0.235 0.219 0.204 0.185
## % of var. 49.833 27.825 10.036 1.676 1.562 1.454 1.319
## Cumulative % of var. 49.833 77.658 87.694 89.370 90.932 92.386 93.705
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 0.166 0.158 0.142 0.127 0.119 0.104 0.066
## % of var. 1.187 1.126 1.013 0.908 0.847 0.741 0.475
## Cumulative % of var. 94.892 96.017 97.030 97.938 98.784 99.525 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 1 | 3.590 | 3.377 0.605 0.885 | -0.077 0.001 0.000 | 0.185
## 2 | 5.235 | -0.331 0.006 0.004 | -2.982 0.846 0.325 | -3.949
## 3 | 3.910 | 3.673 0.716 0.883 | 0.249 0.006 0.004 | -0.104
## 4 | 3.555 | 3.299 0.578 0.861 | 0.272 0.007 0.006 | -0.179
## 5 | 3.899 | 3.700 0.727 0.901 | 0.525 0.026 0.018 | -0.019
## 6 | 3.776 | 3.532 0.662 0.875 | 0.543 0.028 0.021 | -0.031
## 7 | 2.668 | -2.115 0.237 0.629 | 1.169 0.130 0.192 | -0.070
## 8 | 2.378 | -1.955 0.203 0.676 | 0.825 0.065 0.120 | -0.152
## 9 | 6.016 | -2.640 0.370 0.193 | -5.026 2.402 0.698 | 1.318
## 10 | 3.931 | 3.557 0.672 0.819 | -0.293 0.008 0.006 | 0.591
## ctr cos2
## 1 0.009 0.003 |
## 2 4.111 0.569 |
## 3 0.003 0.001 |
## 4 0.008 0.003 |
## 5 0.000 0.000 |
## 6 0.000 0.000 |
## 7 0.001 0.001 |
## 8 0.006 0.004 |
## 9 0.458 0.048 |
## 10 0.092 0.023 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3 ctr
## reputation | 0.932 12.446 0.868 | 0.051 0.068 0.003 | 0.112 0.885
## quality | -0.677 6.578 0.459 | 0.619 9.822 0.383 | 0.047 0.157
## distance | -0.030 0.013 0.001 | -0.333 2.851 0.111 | -0.894 56.821
## doctor | 0.922 12.174 0.849 | 0.043 0.047 0.002 | 0.108 0.823
## friends | -0.761 8.296 0.579 | 0.563 8.138 0.317 | 0.081 0.462
## ads | -0.204 0.597 0.042 | -0.813 16.962 0.661 | 0.400 11.375
## staff | -0.769 8.470 0.591 | 0.587 8.834 0.344 | 0.048 0.161
## facilities | 0.921 12.154 0.848 | 0.079 0.160 0.006 | 0.114 0.931
## nursery | 0.907 11.786 0.822 | 0.027 0.019 0.001 | 0.095 0.648
## insurance | -0.230 0.756 0.053 | -0.818 17.159 0.668 | 0.357 9.088
## cos2
## reputation 0.012 |
## quality 0.002 |
## distance 0.798 |
## doctor 0.012 |
## friends 0.006 |
## ads 0.160 |
## staff 0.002 |
## facilities 0.013 |
## nursery 0.009 |
## insurance 0.128 |
## Principal Components Analysis
## Call: principal(r = df, nfactors = 3, residuals = TRUE, rotate = "varimax",
## scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 RC3 h2 u2 com
## reputation 0.94 0.06 0.01 0.88 0.117 1.0
## quality -0.67 0.50 0.38 0.84 0.156 2.5
## distance -0.15 0.06 -0.94 0.91 0.090 1.1
## doctor 0.93 0.06 0.00 0.86 0.137 1.0
## friends -0.75 0.43 0.40 0.90 0.098 2.2
## ads -0.14 -0.92 0.04 0.86 0.138 1.0
## staff -0.76 0.46 0.38 0.94 0.063 2.2
## facilities 0.93 0.09 0.02 0.87 0.133 1.0
## nursery 0.91 0.05 -0.01 0.83 0.168 1.0
## insurance -0.17 -0.91 0.01 0.85 0.151 1.1
## familiar 0.77 0.48 0.27 0.89 0.105 2.0
## convenience 0.76 0.45 0.30 0.88 0.122 2.0
## fullservice -0.70 -0.52 0.33 0.87 0.131 2.3
## qualcare -0.34 0.67 0.56 0.89 0.115 2.5
##
## RC1 RC2 RC3
## SS loadings 6.87 3.48 1.92
## Proportion Var 0.49 0.25 0.14
## Cumulative Var 0.49 0.74 0.88
## Proportion Explained 0.56 0.28 0.16
## Cumulative Proportion 0.56 0.84 1.00
##
## Mean item complexity = 1.6
## Test of the hypothesis that 3 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.02
## with the empirical chi square 19.98 with prob < 1
##
## Fit based upon off diagonal values = 1
Factor analysis step 4: Determine the number of factors
Check the % var. explained output from the call: summary(pca)
#Factor analysis step 7a: Calculate factor scores
#First calculate eigenvalues and eigenvectors
df.cov <-cov(df)
df.eigen <- eigen(df.cov)
str(df.eigen)## List of 2
## $ values : num [1:14] 15.518 6.041 1.236 0.523 0.43 ...
## $ vectors: num [1:14, 1:14] 0.36598 -0.29733 -0.00374 0.38539 -0.32329 ...
## - attr(*, "class")= chr "eigen"
## [,1] [,2] [,3]
## [1,] 0.365977271 0.10673110 0.11288937
## [2,] -0.297330001 0.35797404 0.08869184
## [3,] -0.003740584 -0.11103984 -0.54120632
## [4,] 0.385389235 0.10441532 0.11492342
## [5,] -0.323287506 0.30781220 0.13737424
## [6,] -0.025618333 -0.33477705 0.44780216
## [7,] -0.325132825 0.31725125 0.08375900
## [8,] 0.357624565 0.12246177 0.12115846
## [9,] 0.363341486 0.09124877 0.11070834
## [10,] -0.031914466 -0.33127856 0.39503048
## [11,] 0.258666710 0.37738086 0.11866565
## [12,] 0.197159692 0.28547680 0.13011535
## [13,] -0.185043938 -0.16085964 0.45440443
## [14,] -0.131459580 0.38327590 0.16243255
phi<- -phi
row.names(phi) <- c("v1","v2","v3","v4","v5","v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14")
colnames(phi) <- c("PC1", "PC2", "PC3")
#Finally, calculate principal components scores
PC1 <- as.matrix(df) %*%phi[,1]
PC2 <- as.matrix(df) %*%phi[,2]
PC3 <- as.matrix(df) %*%phi[,3]
#Store the principal components scores
PC <- data.frame(PC1,PC2,PC3)Q3
Cluster analysis step 4: Decide on the number of clusters
#Check the plots output for the call: plot() on line 20
#Conduct k means cluster analysis to store cluster assignments
set.seed(123)
kmeans3.df <- kmeans(df, 3,nstart = 25)
print(kmeans3.df)## K-means clustering with 3 clusters of sizes 133, 96, 41
##
## Cluster means:
## reputation quality distance doctor friends ads staff facilities
## 1 2.473684 5.511278 2.774436 2.127820 5.082707 2.172932 5.804511 2.406015
## 2 5.500000 2.739583 2.812500 5.281250 2.104167 2.197917 2.802083 5.343750
## 3 2.487805 2.487805 3.804878 2.195122 2.243902 4.219512 2.853659 2.268293
## nursery insurance familiar convenience fullservice qualcare
## 1 2.157895 2.654135 3.661654 3.917293 4.000000 5.218045
## 2 5.156250 2.625000 5.614583 5.406250 2.531250 3.822917
## 3 2.317073 4.658537 1.560976 2.292683 4.487805 2.219512
##
## Clustering vector:
## [1] 2 3 2 2 2 2 1 1 3 2 1 2 1 1 1 2 1 2 1 1 2 2 1 3 1 2 1 3 1 2 2 3 2 2 2 1 1
## [38] 3 3 3 2 2 2 2 1 1 2 1 1 1 1 3 1 1 1 1 1 1 1 1 1 2 1 3 1 1 2 1 3 2 1 1 1 1
## [75] 2 3 2 1 1 2 3 1 1 2 1 1 1 3 2 2 2 2 1 2 2 1 1 1 1 3 3 1 1 3 1 1 1 1 2 1 2
## [112] 1 1 1 2 1 1 1 1 1 1 3 1 2 2 2 3 1 1 1 3 1 3 2 1 2 2 3 1 2 2 1 2 2 2 2 2 1
## [149] 1 1 1 3 3 1 3 3 2 1 2 1 1 1 1 2 2 2 1 3 1 1 3 1 1 3 2 2 2 2 2 1 2 2 3 1 2
## [186] 1 2 1 1 1 1 1 1 2 1 2 2 1 1 2 2 1 1 1 1 3 1 2 2 3 3 2 1 1 1 1 1 1 2 2 2 1
## [223] 2 2 2 2 1 3 2 2 3 2 3 2 1 2 3 1 2 2 1 3 3 2 1 2 1 2 3 1 1 1 3 1 1 1 2 1 1
## [260] 2 1 1 1 2 1 2 2 1 1 2
##
## Within cluster sum of squares by cluster:
## [1] 494.8722 368.5625 503.0732
## (between_SS / total_SS = 80.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Q4
Cluster analysis step 4: Decide on the number of clusters
#Check the plots output for the call: plot() on line 20
#Conduct k means cluster analysis to store cluster assignments
set.seed(123)
kmeans4.df <- kmeans(df, 4,nstart = 25)
print(kmeans4.df)## K-means clustering with 4 clusters of sizes 28, 133, 13, 96
##
## Cluster means:
## reputation quality distance doctor friends ads staff facilities
## 1 2.500000 2.357143 2.892857 2.178571 2.321429 5.214286 2.857143 2.285714
## 2 2.473684 5.511278 2.774436 2.127820 5.082707 2.172932 5.804511 2.406015
## 3 2.461538 2.769231 5.769231 2.230769 2.076923 2.076923 2.846154 2.230769
## 4 5.500000 2.739583 2.812500 5.281250 2.104167 2.197917 2.802083 5.343750
## nursery insurance familiar convenience fullservice qualcare
## 1 2.321429 5.571429 1.464286 2.285714 5.464286 2.285714
## 2 2.157895 2.654135 3.661654 3.917293 4.000000 5.218045
## 3 2.307692 2.692308 1.769231 2.307692 2.384615 2.076923
## 4 5.156250 2.625000 5.614583 5.406250 2.531250 3.822917
##
## Clustering vector:
## [1] 4 3 4 4 4 4 2 2 1 4 2 4 2 2 2 4 2 4 2 2 4 4 2 3 2 4 2 3 2 4 4 1 4 4 4 2 2
## [38] 3 3 3 4 4 4 4 2 2 4 2 2 2 2 1 2 2 2 2 2 2 2 2 2 4 2 1 2 2 4 2 1 4 2 2 2 2
## [75] 4 1 4 2 2 4 1 2 2 4 2 2 2 1 4 4 4 4 2 4 4 2 2 2 2 3 1 2 2 1 2 2 2 2 4 2 4
## [112] 2 2 2 4 2 2 2 2 2 2 1 2 4 4 4 3 2 2 2 1 2 1 4 2 4 4 1 2 4 4 2 4 4 4 4 4 2
## [149] 2 2 2 1 3 2 1 1 4 2 4 2 2 2 2 4 4 4 2 1 2 2 1 2 2 3 4 4 4 4 4 2 4 4 1 2 4
## [186] 2 4 2 2 2 2 2 2 4 2 4 4 2 2 4 4 2 2 2 2 1 2 4 4 1 3 4 2 2 2 2 2 2 4 4 4 2
## [223] 4 4 4 4 2 1 4 4 1 4 3 4 2 4 1 2 4 4 2 3 1 4 2 4 2 4 1 2 2 2 1 2 2 2 4 2 2
## [260] 4 2 2 2 4 2 4 4 2 2 4
##
## Within cluster sum of squares by cluster:
## [1] 132.50000 494.87218 48.61538 368.56250
## (between_SS / total_SS = 85.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# a. Which clusters has the most number of prospective consumers below age of 25?
dfa <-
hospital %>%
filter(age < 25) %>%
select(-c(15:19))
set.seed(123)
kmeans4a.df <- kmeans(dfa, 4, nstart = 25)
print(kmeans4a.df)## K-means clustering with 4 clusters of sizes 28, 133, 13, 96
##
## Cluster means:
## reputation quality distance doctor friends ads staff facilities
## 1 2.500000 2.357143 2.892857 2.178571 2.321429 5.214286 2.857143 2.285714
## 2 2.473684 5.511278 2.774436 2.127820 5.082707 2.172932 5.804511 2.406015
## 3 2.461538 2.769231 5.769231 2.230769 2.076923 2.076923 2.846154 2.230769
## 4 5.500000 2.739583 2.812500 5.281250 2.104167 2.197917 2.802083 5.343750
## nursery insurance familiar convenience fullservice qualcare
## 1 2.321429 5.571429 1.464286 2.285714 5.464286 2.285714
## 2 2.157895 2.654135 3.661654 3.917293 4.000000 5.218045
## 3 2.307692 2.692308 1.769231 2.307692 2.384615 2.076923
## 4 5.156250 2.625000 5.614583 5.406250 2.531250 3.822917
##
## Clustering vector:
## [1] 4 3 4 4 4 4 2 2 1 4 2 4 2 2 2 4 2 4 2 2 4 4 2 3 2 4 2 3 2 4 4 1 4 4 4 2 2
## [38] 3 3 3 4 4 4 4 2 2 4 2 2 2 2 1 2 2 2 2 2 2 2 2 2 4 2 1 2 2 4 2 1 4 2 2 2 2
## [75] 4 1 4 2 2 4 1 2 2 4 2 2 2 1 4 4 4 4 2 4 4 2 2 2 2 3 1 2 2 1 2 2 2 2 4 2 4
## [112] 2 2 2 4 2 2 2 2 2 2 1 2 4 4 4 3 2 2 2 1 2 1 4 2 4 4 1 2 4 4 2 4 4 4 4 4 2
## [149] 2 2 2 1 3 2 1 1 4 2 4 2 2 2 2 4 4 4 2 1 2 2 1 2 2 3 4 4 4 4 4 2 4 4 1 2 4
## [186] 2 4 2 2 2 2 2 2 4 2 4 4 2 2 4 4 2 2 2 2 1 2 4 4 1 3 4 2 2 2 2 2 2 4 4 4 2
## [223] 4 4 4 4 2 1 4 4 1 4 3 4 2 4 1 2 4 4 2 3 1 4 2 4 2 4 1 2 2 2 1 2 2 2 4 2 2
## [260] 4 2 2 2 4 2 4 4 2 2 4
##
## Within cluster sum of squares by cluster:
## [1] 132.50000 494.87218 48.61538 368.56250
## (between_SS / total_SS = 85.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
(Optional: Rescale the Data (From book))
## reputation quality distance doctor
## Min. :-1.6356 Min. :-1.94199 Min. :-1.24755 Min. :-1.3678
## 1st Qu.:-0.9947 1st Qu.:-0.67547 1st Qu.: 0.07339 1st Qu.:-0.7624
## Median :-0.3537 Median :-0.04222 Median : 0.07339 Median :-0.1570
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.9282 3rd Qu.: 1.22430 3rd Qu.: 0.07339 3rd Qu.: 1.0539
## Max. : 1.5692 Max. : 1.22430 Max. : 4.03618 Max. : 1.6594
## friends ads staff facilities
## Min. :-1.6638 Min. :-1.4388 Min. :-1.4799 Min. :-1.5701
## 1st Qu.:-1.0220 1st Qu.:-0.4748 1st Qu.:-0.8334 1st Qu.:-0.9239
## Median :-0.3803 Median :-0.4748 Median :-0.8334 Median :-0.2776
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.9032 3rd Qu.: 0.4891 3rd Qu.: 1.1063 3rd Qu.: 1.0148
## Max. : 1.5449 Max. : 3.3810 Max. : 1.1063 Max. : 1.6610
## nursery insurance familiar convenience
## Min. :-1.4199 Min. :-0.93389 Min. :-2.08338 Min. :-1.9365
## 1st Qu.:-0.7883 1st Qu.:-0.93389 1st Qu.:-0.71140 1st Qu.:-0.1760
## Median :-0.1567 Median : 0.05107 Median :-0.02541 Median :-0.1760
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 1.1064 3rd Qu.: 0.05107 3rd Qu.: 0.66058 3rd Qu.: 0.7042
## Max. : 1.7380 Max. : 3.00597 Max. : 1.34658 Max. : 1.5844
## fullservice qualcare
## Min. :-1.4978 Min. :-1.9464
## 1st Qu.:-0.5326 1st Qu.:-0.2290
## Median : 0.4325 Median :-0.2290
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4325 3rd Qu.: 0.6297
## Max. : 2.3628 Max. : 1.4884
Alternate PCA. same as above
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.6413 1.9737 1.1853 0.48442 0.46762 0.45116 0.42972
## Proportion of Variance 0.4983 0.2782 0.1004 0.01676 0.01562 0.01454 0.01319
## Cumulative Proportion 0.4983 0.7766 0.8769 0.89370 0.90932 0.92386 0.93705
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.40760 0.39696 0.37652 0.35651 0.34428 0.32213 0.25774
## Proportion of Variance 0.01187 0.01126 0.01013 0.00908 0.00847 0.00741 0.00475
## Cumulative Proportion 0.94892 0.96017 0.97030 0.97938 0.98784 0.99525 1.00000
Scree Plot
The elbow occurs at either component 3 or 4, depending on interpretation, and this suggests that the first 2 or 3 components explain most of the variation in the observed hospital ratings.
Another solution is to use PCA
Cluster analysis step 2 & 3:
Select a distance measure & clustering procedure Plot Dendrogram
library(factoextra)
# First compute the disimilarity matrix
d <- dist(hospital.sc, method = "euclidean")
# Hierarchical clustering using Wards.D
hcl <-hclust(d, method = "ward.D2")
# Plot the obtained dendrogram
plot(hcl, cex = 0.6, hang = -1, las = 1)