# PREP #########################################################################
# Packages
pacman::p_load(
pacman, rio, tidyverse, magrittr, janitor, # general stuff
psych, # EDA
visdat, # missingness
data.table, # working with data.tables
corrplot, # correlation plot
FactoMineR, # EDA, PCA, MFA
factoextra, # extract and visualize PCA/MFA
nFactors, # how many factors/components to retain
cluster, # clustering algorithms
NbClust, # number of clusters
clValid # ?
)
# Data
data <- import("data/Travel_Review.xlsx", range = "A1:Y5457",
na = "0")
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in L2714 / R2714C12: got '2 2.'
# clean variable names
data %<>% rename(SwimmingPools = `Swimming Pools`)
# drop missings
complete <- data[complete.cases(data),]
# standardize by user
data2 <- complete
data2[,-1] <- t(data2[,-1]) %>% scale() %>% t()
# PCA - 1st PASS ###############################################################
# RUN THE PCA
# create X matrix
# X <- subset(data2, select = -c(UserID))
X <- subset(data2, select = -c(UserID, DanceClubs))
str(X)
## 'data.frame': 3724 obs. of 23 variables:
## $ Churches : num -0.609 -0.722 -0.663 -0.587 -0.679 ...
## $ Resorts : num -0.651 -0.678 -0.619 -0.595 -0.635 ...
## $ Beaches : num -0.617 -0.67 -0.612 -0.587 -0.628 ...
## $ Parks : num -0.593 2.39 -0.597 -0.562 -0.606 ...
## $ Theatres : num 1.89 1.55 1.62 1.92 1.56 ...
## $ Museums : num 2.84 2.39 2.45 1.15 2.37 ...
## $ Malls : num 1.129 0.878 2.448 2.853 0.859 ...
## $ Zoo : num 1.121 0.856 0.91 1.121 0.845 ...
## $ Restaurants : num 1.038 0.79 0.837 1.029 0.765 ...
## $ Pubs_Bars : num 1.046 0.489 0.837 1.029 0.765 ...
## $ LocalServices : num 0.409 0.225 0.271 0.391 0.199 ...
## $ Burger_PizzaShops : num -0.0876 -0.208 -0.1707 -0.1143 -0.2288 ...
## $ Hotels_OtherLodgings: num -0.179 -0.296 -0.281 -0.23 -0.316 ...
## $ JuiceBars : num -0.212 -0.325 -0.281 -0.239 -0.345 ...
## $ ArtGalleries : num -0.195 -0.318 -0.274 -0.23 -0.338 ...
## $ SwimmingPools : num -0.841 -0.281 -0.244 -0.811 -0.845 ...
## $ Gyms : num -0.816 -0.854 -0.803 -0.811 -0.831 ...
## $ Bakeries : num -0.857 -0.898 -0.847 -0.852 -0.874 ...
## $ BeautySpas : num -0.882 -0.913 -0.855 -0.869 -0.882 ...
## $ Cafes : num -0.675 -0.898 -0.84 -0.852 -0.874 ...
## $ ViewPoints : num -0.692 -0.729 -0.671 -0.645 2.368 ...
## $ Monuments : num -0.7 -0.744 -0.686 -0.661 -0.7 ...
## $ Gardens : num -0.692 -0.736 -0.663 -0.628 -0.621 ...
# run PCA
pca <- prcomp(X,
center = TRUE,
scale. = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.2674 1.6773 1.31537 1.28890 1.18226 1.08449 1.02765
## Proportion of Variance 0.2235 0.1223 0.07523 0.07223 0.06077 0.05114 0.04592
## Cumulative Proportion 0.2235 0.3458 0.42108 0.49331 0.55408 0.60521 0.65113
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.9287 0.87314 0.8471 0.82327 0.77763 0.76758 0.7352
## Proportion of Variance 0.0375 0.03315 0.0312 0.02947 0.02629 0.02562 0.0235
## Cumulative Proportion 0.6886 0.72177 0.7530 0.78244 0.80873 0.83435 0.8579
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.71319 0.68935 0.66916 0.63865 0.63332 0.59991 0.57179
## Proportion of Variance 0.02211 0.02066 0.01947 0.01773 0.01744 0.01565 0.01422
## Cumulative Proportion 0.87996 0.90062 0.92009 0.93782 0.95526 0.97091 0.98513
## PC22 PC23
## Standard deviation 0.56279 0.1593
## Proportion of Variance 0.01377 0.0011
## Cumulative Proportion 0.99890 1.0000
# pca2 <- principal(X, nfactors = 7, residuals = T)
# pca2
# print(pca2$loadings, cutoff = 0.4, sort = T)
# plot(pca2)
# add labels to PCA results
pca_label <- cbind(data2, as.data.frame(pca$x))
head(pca_label)
## UserID Churches Resorts Beaches Parks Theatres Museums
## 69 User 69 -0.6091162 -0.6505058 -0.6173941 -0.5925604 1.890813 2.842772
## 70 User 70 -0.7217397 -0.6777013 -0.6703616 2.3903039 1.553575 2.390304
## 71 User 71 -0.6634526 -0.6193246 -0.6119699 -0.5972605 1.616495 2.447573
## 72 User 72 -0.5867433 -0.5950317 -0.5867433 -0.5618784 1.916327 1.145514
## 73 User 73 -0.6785527 -0.6350286 -0.6277746 -0.6060126 1.555684 2.368134
## 74 User 74 -0.7170883 -0.6864435 -0.6741855 1.8448168 1.158373 1.844817
## Malls Zoo Restaurants Pubs_Bars LocalServices Burger_PizzaShops
## 69 1.1292449 1.1209670 1.0381879 1.0464658 0.40906672 -0.08760788
## 70 0.8783205 0.8563013 0.7902438 0.4893150 0.22508491 -0.20795889
## 71 2.4475731 0.9104469 0.8369002 0.8369002 0.27059059 -0.17068965
## 72 2.8529057 1.1206487 1.0294773 1.0294773 0.39127734 -0.11430962
## 73 0.8592986 0.8447906 0.7649964 0.7649964 0.19918319 -0.22880375
## 74 1.8448168 0.5577353 0.4841878 0.4780588 0.00612896 -0.36773757
## Hotels_OtherLodgings JuiceBars ArtGalleries DanceClubs SwimmingPools
## 69 -0.1786649 -0.2117765 -0.1952207 -0.1786649 -0.8408977
## 70 -0.2960356 -0.3253945 -0.3180548 -0.3033753 -0.2813561
## 71 -0.2810097 -0.2810097 -0.2736550 -0.2589457 -0.2442364
## 72 -0.2303460 -0.2386343 -0.2303460 -0.2137694 -0.8105277
## 73 -0.3158519 -0.3448680 -0.3376140 -0.3231060 -0.8453951
## 74 -0.4474140 -0.4596720 -0.4535430 -0.4412851 -0.8703123
## Gyms Bakeries BeautySpas Cafes ViewPoints Monuments Gardens
## 69 -0.8160640 -0.8574535 -0.8822872 -0.6753395 -0.6918953 -0.7001732 -0.6918953
## 70 -0.8538547 -0.8978931 -0.9125725 -0.8978931 -0.7290794 -0.7437588 -0.7364191
## 71 -0.8031913 -0.8473193 -0.8546740 -0.8399647 -0.6708072 -0.6855166 -0.6634526
## 72 -0.8105277 -0.8519693 -0.8685459 -0.8519693 -0.6447615 -0.6613381 -0.6281849
## 73 -0.8308871 -0.8744112 -0.8816652 -0.8744112 2.3681339 -0.7003148 -0.6205206
## 74 -0.8519254 -0.8948281 -0.9009571 -0.8948281 1.8448168 -0.7354751 -0.6680566
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 69 -2.4018090 1.2278103 -0.2923164 -0.5814276 -2.032376 0.8311117 -1.0391171
## 70 -1.5285281 2.2347710 0.7060933 -0.2213471 -2.008094 1.3924620 0.4016328
## 71 -2.3264528 0.9043579 -0.2234224 -0.9065431 -2.547463 0.6729884 -0.8727915
## 72 -2.6156521 0.7834587 -0.5954404 -0.5106310 -1.763650 0.2650046 -0.8500840
## 73 -1.1433292 1.2343446 -0.7913175 0.1942561 -1.620325 1.4250462 -0.6586073
## 74 -0.9124913 2.0710481 -0.1661480 0.1184531 -1.415868 1.7969091 -0.0221131
## PC8 PC9 PC10 PC11 PC12 PC13
## 69 0.3297053 0.11251663 -0.22774677 0.03073840 0.05046358 -0.39227801
## 70 0.2264530 0.42523596 -0.28907466 -0.26650641 1.16215674 -0.15074384
## 71 0.1555925 0.21174085 0.49257294 0.04518984 -0.08680914 -0.15991534
## 72 0.3357248 0.07728942 0.08217964 -0.15327959 -0.34785789 -0.08047636
## 73 0.5737145 -0.48398956 -0.39330174 0.31567044 0.41228850 -0.43594555
## 74 0.5371520 -0.39570900 -0.58174990 -0.17581793 0.61425642 -0.38527620
## PC14 PC15 PC16 PC17 PC18 PC19
## 69 -0.07456332 -0.35780502 0.48022504 0.876865612 0.19106203 1.5360486
## 70 -0.17160864 0.02679749 -0.04002613 -0.042603440 -0.67779072 0.2599628
## 71 -0.30669290 -0.67279644 0.42744732 0.763634169 -0.05546526 0.6578375
## 72 -0.13233556 -0.11201236 0.20550858 0.316993416 -0.42551784 0.5378189
## 73 -0.28563083 -0.35644848 1.76171306 0.996104985 1.45044406 0.7993980
## 74 -0.20132555 0.17623385 1.15375125 0.003257858 0.17932864 -0.6338799
## PC20 PC21 PC22 PC23
## 69 0.6678709 -1.1415123 0.1812368 -0.122754769
## 70 0.8980402 0.5814774 -0.5800904 -0.028220795
## 71 -0.3847716 -0.9790030 -0.1680465 -0.095953139
## 72 -1.3612756 -0.9113013 0.6005325 -0.105124169
## 73 -0.1772946 -0.9531523 0.1948476 -0.067323662
## 74 -0.5981263 0.4188443 -0.7404749 -0.001799961
c_pca <- cor(subset(pca_label, select = PC1:PC7))
corrplot(c_pca)

# ANALYZING THE RESULTS
# get eigenvalues
eig.val <- get_eigenvalue(pca)
eig.val %>% round(., 2)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 5.14 22.35 22.35
## Dim.2 2.81 12.23 34.59
## Dim.3 1.73 7.52 42.11
## Dim.4 1.66 7.22 49.33
## Dim.5 1.40 6.08 55.41
## Dim.6 1.18 5.11 60.52
## Dim.7 1.06 4.59 65.11
## Dim.8 0.86 3.75 68.86
## Dim.9 0.76 3.31 72.18
## Dim.10 0.72 3.12 75.30
## Dim.11 0.68 2.95 78.24
## Dim.12 0.60 2.63 80.87
## Dim.13 0.59 2.56 83.43
## Dim.14 0.54 2.35 85.78
## Dim.15 0.51 2.21 88.00
## Dim.16 0.48 2.07 90.06
## Dim.17 0.45 1.95 92.01
## Dim.18 0.41 1.77 93.78
## Dim.19 0.40 1.74 95.53
## Dim.20 0.36 1.56 97.09
## Dim.21 0.33 1.42 98.51
## Dim.22 0.32 1.38 99.89
## Dim.23 0.03 0.11 100.00
# screeplot
fviz_eig(pca,
choice = "variance",
addlabels = T) +
labs(title = "Scree Plot of Variance Explained by PCA Dimension") +
theme(plot.title = element_text(hjust=0.5, size=14, face="bold"))

fviz_eig(pca,
choice = "eigenvalue",
addlabels = T) +
labs(title = "Scree Plot of Eigenvalues by PCA Dimension") +
theme(plot.title = element_text(hjust=0.5, size=14, face="bold")) # 590x590

# estiamte number of factors
vss(X, n = 10)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully

##
## Very Simple Structure
## Call: vss(x = X, n = 10)
## VSS complexity 1 achieves a maximimum of 0.53 with 1 factors
## VSS complexity 2 achieves a maximimum of 0.7 with 3 factors
##
## The Velicer MAP achieves a minimum of 0.02 with 4 factors
## BIC achieves a minimum of 6564.07 with 10 factors
## Sample Size adjusted BIC achieves a minimum of 6780.14 with 10 factors
##
## Statistics by number of factors
## vss1 vss2 map dof chisq prob sqresid fit RMSEA BIC SABIC complex eChisq
## 1 0.53 0.00 0.026 230 22982 0 23.3 0.53 0.16 21091 21821 1.0 29250
## 2 0.50 0.68 0.023 208 18156 0 15.8 0.68 0.15 16446 17107 1.4 14234
## 3 0.48 0.70 0.022 187 14975 0 13.0 0.74 0.15 13437 14031 1.6 9565
## 4 0.45 0.69 0.021 167 12284 0 10.4 0.79 0.14 10911 11441 1.9 5655
## 5 0.45 0.69 0.023 148 11048 0 8.8 0.82 0.14 9831 10301 2.0 3824
## 6 0.45 0.66 0.025 130 9829 0 7.5 0.85 0.14 8760 9173 2.2 2560
## 7 0.41 0.65 0.027 113 8578 0 6.4 0.87 0.14 7649 8008 2.2 1482
## 8 0.41 0.66 0.033 97 8026 0 5.8 0.88 0.15 7228 7537 2.2 1216
## 9 0.42 0.65 0.041 82 7552 0 5.1 0.90 0.16 6878 7138 2.2 1019
## 10 0.43 0.65 0.048 68 7123 0 4.7 0.90 0.17 6564 6780 2.4 799
## SRMR eCRMS eBIC
## 1 0.125 0.131 27359
## 2 0.087 0.096 12524
## 3 0.071 0.083 8027
## 4 0.055 0.067 4281
## 5 0.045 0.059 2607
## 6 0.037 0.051 1491
## 7 0.028 0.042 553
## 8 0.025 0.041 418
## 9 0.023 0.041 345
## 10 0.021 0.040 240
# lines in VSS = how many components will we allow each var to contribute to
# VSS helps if we're assign a var to only 1 component
# 1 factor per var not helpful here
nfactors(X, n = 15)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## An ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully

##
## Number of factors
## Call: vss(x = x, n = n, rotate = rotate, diagonal = diagonal, fm = fm,
## n.obs = n.obs, plot = FALSE, title = title, use = use, cor = cor)
## VSS complexity 1 achieves a maximimum of 0.53 with 1 factors
## VSS complexity 2 achieves a maximimum of 0.7 with 3 factors
## The Velicer MAP achieves a minimum of 0.02 with 4 factors
## Empirical BIC achieves a minimum of 98.08 with 15 factors
## Sample Size adjusted BIC achieves a minimum of 4709.87 with 15 factors
##
## Statistics by number of factors
## vss1 vss2 map dof chisq prob sqresid fit RMSEA BIC SABIC complex eChisq
## 1 0.53 0.00 0.026 230 22982 0 23.3 0.53 0.16 21091 21821 1.0 29250
## 2 0.50 0.68 0.023 208 18156 0 15.8 0.68 0.15 16446 17107 1.4 14234
## 3 0.48 0.70 0.022 187 14975 0 13.0 0.74 0.15 13437 14031 1.6 9565
## 4 0.45 0.69 0.021 167 12284 0 10.4 0.79 0.14 10911 11441 1.9 5655
## 5 0.45 0.69 0.023 148 11048 0 8.8 0.82 0.14 9831 10301 2.0 3824
## 6 0.45 0.66 0.025 130 9829 0 7.5 0.85 0.14 8760 9173 2.2 2560
## 7 0.41 0.65 0.027 113 8578 0 6.4 0.87 0.14 7649 8008 2.2 1482
## 8 0.41 0.66 0.033 97 8026 0 5.8 0.88 0.15 7228 7537 2.2 1216
## 9 0.42 0.65 0.041 82 7552 0 5.1 0.90 0.16 6878 7138 2.2 1019
## 10 0.43 0.65 0.048 68 7123 0 4.7 0.90 0.17 6564 6780 2.4 799
## 11 0.46 0.66 0.059 55 6743 0 4.1 0.92 0.18 6291 6465 2.2 640
## 12 0.43 0.65 0.072 43 6278 0 3.6 0.93 0.20 5925 6061 2.4 491
## 13 0.45 0.66 0.083 32 5883 0 3.3 0.93 0.22 5620 5722 2.2 409
## 14 0.45 0.64 0.102 22 5485 0 2.7 0.95 0.26 5304 5374 2.3 307
## 15 0.47 0.66 0.128 13 4775 0 2.3 0.95 0.31 4669 4710 2.3 205
## SRMR eCRMS eBIC
## 1 0.125 0.131 27359
## 2 0.087 0.096 12524
## 3 0.071 0.083 8027
## 4 0.055 0.067 4281
## 5 0.045 0.059 2607
## 6 0.037 0.051 1491
## 7 0.028 0.042 553
## 8 0.025 0.041 418
## 9 0.023 0.041 345
## 10 0.021 0.040 240
## 11 0.018 0.040 188
## 12 0.016 0.039 138
## 13 0.015 0.041 146
## 14 0.013 0.043 126
## 15 0.010 0.046 98
# DATA2: I'm inclined to keep 7 PCs
# Dims 1-7 have eigenvals > 1
# each dim explains at least 4% of variance
# cumulatively explains 65.1% of variance
# RESULTS AT THE VARIABLE LEVEL
var <- get_pca_var(pca)
var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
print(round(var$cor[,1:7],2))
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Churches 0.64 0.02 -0.09 0.01 0.09 -0.33 -0.33
## Resorts 0.27 0.17 0.00 -0.44 0.35 -0.52 0.10
## Beaches 0.36 0.37 0.19 -0.20 0.41 -0.17 0.22
## Parks 0.16 0.71 0.26 0.12 0.08 0.19 0.29
## Theatres 0.02 0.75 0.39 0.08 -0.14 0.18 -0.02
## Museums -0.29 0.55 0.18 -0.35 -0.34 0.22 -0.29
## Malls -0.69 0.11 -0.06 -0.32 -0.21 -0.02 -0.24
## Zoo -0.66 0.21 -0.33 -0.13 -0.18 -0.22 -0.09
## Restaurants -0.70 0.05 -0.40 -0.25 -0.01 0.02 0.08
## Pubs_Bars -0.66 0.04 -0.45 0.19 0.04 -0.12 0.24
## LocalServices -0.52 0.05 -0.16 0.54 0.04 -0.25 0.24
## Burger_PizzaShops -0.46 -0.20 0.40 0.36 0.01 -0.16 -0.05
## Hotels_OtherLodgings -0.43 -0.27 0.42 0.43 0.13 -0.02 -0.17
## JuiceBars -0.33 -0.53 0.41 -0.03 0.18 0.09 -0.18
## ArtGalleries -0.26 -0.51 -0.05 -0.16 0.31 0.31 -0.03
## SwimmingPools 0.33 -0.27 0.07 0.02 -0.56 -0.05 0.37
## Gyms 0.36 -0.39 0.14 -0.09 -0.54 -0.14 0.25
## Bakeries 0.37 -0.40 0.24 -0.30 -0.15 -0.12 0.11
## BeautySpas 0.32 -0.28 -0.07 -0.28 0.26 0.28 0.14
## Cafes 0.45 -0.23 -0.42 0.02 -0.06 0.34 -0.12
## ViewPoints 0.60 0.19 -0.30 0.34 0.07 0.28 0.08
## Monuments 0.57 0.11 -0.28 0.33 -0.05 0.01 -0.22
## Gardens 0.62 0.02 -0.09 0.17 -0.14 -0.29 -0.41
var$cor[,1:7] %>% as.data.frame() %>%
arrange(desc(Dim.1))%>% as.matrix() %>%
corrplot(is.corr = F, tl.col = "black",
cl.align.text = "l",
mar = c(0,0,2,0),
title = "Correlations Between Variables & Dimensions")

corrplot(var$cor, is.corr = FALSE,
cl.align.text = "l") #

corrplot(var$cos2[,1:7], is.corr = FALSE) # quality of representation

corrplot(var$contrib[,1:7], is.corr = FALSE) # amount of contributions

# Contributions of variables to all PCs kept
fviz_contrib(pca, choice = "var", axes = 1:7)

# dashed line is expected value if contributions were uniform
# Contributions of variables to individual PCs
fviz_contrib(pca, choice = "var", axes = 1)

fviz_contrib(pca, choice = "var", axes = 2)

# Correlation Plots
fviz_pca_var(pca, col.var = "cos2",
labelsize = 3,
# select.var = list(cos2 = 0.3),
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoids text overlapping
) + labs(title = "Variable Correlation - Dims 1 & 2",
subtitle = "colored by 'quality of representation'")

fviz_pca_var(pca, col.var = "contrib",
labelsize = 3,
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE
) + labs(title = "Variable Correlation - Dims 1 & 2",
subtitle = "colored by variable contribution")

fviz_pca_var(pca, col.var = "cos2",
axes = c(1,2),
labelsize = 3,
# select.var = list(cos2 = 0.3),
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)

# creating rough clusters
set.seed(1679)
km <- kmeans(var$coord, centers = 6, nstart = 25)
grp <- as.factor(km$cluster)
# Color variables by groups
fviz_pca_var(pca, col.var = grp,
labelsize = 3,
legend.title = "Cluster",
repel = T)

# RESULTS AT THE INDIVIDUAL LEVEL
ind <- get_pca_ind(pca)
ind
## Principal Component Analysis Results for individuals
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the individuals"
## 2 "$cos2" "Cos2 for the individuals"
## 3 "$contrib" "contributions of the individuals"
# Correlation Plots
fviz_pca_ind(pca, col.ind = "cos2",
# select.ind = list(cos2 = 0.3),
gradient.cols = c("#0073C2FF", "#00AFBB", "#E7B800", "#FC4E07"),
label = "none"
) + labs(title = "Individual Correlation - Dims 1 & 2",
subtitle = "colored by 'quality of representation'")

fviz_pca_ind(pca, col.ind = "contrib", axes = c(1,2),
gradient.cols = c("#0073C2FF", "#00AFBB", "#E7B800", "#FC4E07"),
label = "none"
) + labs(title = "Individual Correlation - Dims 1 & 2",
subtitle = "colored by individual contribution")

ind$coord %>% scale() %>%
as.data.frame() %>%
ggplot(aes(Dim.2, Dim.3)) + geom_point()

# visualizing individuals on kept PCs, slow
PerformanceAnalytics::chart.Correlation(dplyr::select(pca_label, PC1:PC7))

# Contributions of individuals to PCs
fviz_contrib(pca, choice = "ind", axes = 1:2)

# slow to run on my laptop
# Rotations are eigenvectors! Aka component loadings
# loadings represent each feature's INFLUENCE on associated PC
PC1 <- pca$rotation[,1]
PC1_scores <- abs(PC1)
PC1_scores_ordered <- sort(PC1_scores, decreasing = TRUE)
PC1_scores_ordered
## Restaurants Malls Pubs_Bars
## 0.306887643 0.304638343 0.291667870
## Zoo Churches Gardens
## 0.291009048 0.282251732 0.271254093
## ViewPoints Monuments LocalServices
## 0.266124324 0.251086378 0.227637461
## Burger_PizzaShops Cafes Hotels_OtherLodgings
## 0.200898981 0.200623532 0.187490813
## Bakeries Gyms Beaches
## 0.161152907 0.160698686 0.159532793
## JuiceBars SwimmingPools BeautySpas
## 0.146787198 0.144747636 0.140974848
## Museums Resorts ArtGalleries
## 0.126624564 0.120836301 0.114199223
## Parks Theatres
## 0.072196224 0.007721186
eig.vec <- pca$rotation %>% as_tibble() %>%
mutate(feature = row.names(pca$rotation))
# influence of each var on PC1
ggplot(eig.vec, aes(PC1, reorder(feature, PC1))) + geom_point()

ggplot(eig.vec, aes(PC1, PC2, label = feature)) + geom_text()

# EFA - 1st PASS ###############################################################
# RUN THE EFA
# create X matrix
# different from PCA, trying something
X <- data2[,-1]
str(X)
## 'data.frame': 3724 obs. of 24 variables:
## $ Churches : num -0.609 -0.722 -0.663 -0.587 -0.679 ...
## $ Resorts : num -0.651 -0.678 -0.619 -0.595 -0.635 ...
## $ Beaches : num -0.617 -0.67 -0.612 -0.587 -0.628 ...
## $ Parks : num -0.593 2.39 -0.597 -0.562 -0.606 ...
## $ Theatres : num 1.89 1.55 1.62 1.92 1.56 ...
## $ Museums : num 2.84 2.39 2.45 1.15 2.37 ...
## $ Malls : num 1.129 0.878 2.448 2.853 0.859 ...
## $ Zoo : num 1.121 0.856 0.91 1.121 0.845 ...
## $ Restaurants : num 1.038 0.79 0.837 1.029 0.765 ...
## $ Pubs_Bars : num 1.046 0.489 0.837 1.029 0.765 ...
## $ LocalServices : num 0.409 0.225 0.271 0.391 0.199 ...
## $ Burger_PizzaShops : num -0.0876 -0.208 -0.1707 -0.1143 -0.2288 ...
## $ Hotels_OtherLodgings: num -0.179 -0.296 -0.281 -0.23 -0.316 ...
## $ JuiceBars : num -0.212 -0.325 -0.281 -0.239 -0.345 ...
## $ ArtGalleries : num -0.195 -0.318 -0.274 -0.23 -0.338 ...
## $ DanceClubs : num -0.179 -0.303 -0.259 -0.214 -0.323 ...
## $ SwimmingPools : num -0.841 -0.281 -0.244 -0.811 -0.845 ...
## $ Gyms : num -0.816 -0.854 -0.803 -0.811 -0.831 ...
## $ Bakeries : num -0.857 -0.898 -0.847 -0.852 -0.874 ...
## $ BeautySpas : num -0.882 -0.913 -0.855 -0.869 -0.882 ...
## $ Cafes : num -0.675 -0.898 -0.84 -0.852 -0.874 ...
## $ ViewPoints : num -0.692 -0.729 -0.671 -0.645 2.368 ...
## $ Monuments : num -0.7 -0.744 -0.686 -0.661 -0.7 ...
## $ Gardens : num -0.692 -0.736 -0.663 -0.628 -0.621 ...
# couldn't get this to work with data2... doesn't like negative? diff way to scale?
# fit <- factanal(X, factors = 8, rotation = "varimax")
# print(fit, digits = 2, cutoff = 0.3, sort = T)
# print(fit$loadings, cutoff = 0.5, sort = T)
# Keeping 7 factors: SS loadings > 1, prop var > 4%, cum var = 44.3%
# Calculate and plot factors with fa()
fit <- fa(X, nfactors = 7, rotate = "varimax", fm = "pa")
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## In factor.scores, the correlation matrix is singular, an approximation is used
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
fit
## Factor Analysis using method = pa
## Call: fa(r = X, nfactors = 7, rotate = "varimax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA2 PA4 PA3 PA6 PA5 PA7 h2 u2 com
## Churches 0.59 -0.06 -0.22 0.15 -0.06 0.06 0.27 0.50 0.50 1.9
## Resorts 0.06 0.00 -0.11 0.16 0.03 -0.02 0.66 0.47 0.53 1.2
## Beaches 0.10 0.27 -0.20 0.12 -0.13 -0.12 0.45 0.37 0.63 2.9
## Parks -0.01 0.74 -0.02 0.11 -0.18 -0.15 0.13 0.63 0.37 1.3
## Theatres 0.01 0.82 -0.06 -0.02 0.13 -0.12 -0.02 0.70 0.30 1.1
## Museums -0.18 0.45 -0.09 0.12 0.62 -0.12 -0.15 0.68 0.32 2.4
## Malls -0.38 -0.06 0.23 -0.07 0.61 -0.15 -0.12 0.61 0.39 2.3
## Zoo -0.24 -0.05 0.53 0.03 0.48 -0.16 -0.04 0.60 0.40 2.6
## Restaurants -0.47 -0.20 0.42 0.14 0.34 -0.25 -0.07 0.65 0.35 4.1
## Pubs_Bars -0.32 -0.15 0.71 0.00 0.04 -0.21 -0.11 0.68 0.32 1.8
## LocalServices -0.08 0.02 0.64 -0.30 -0.07 -0.13 -0.12 0.54 0.46 1.7
## Burger_PizzaShops -0.17 -0.02 0.18 -0.56 0.04 0.01 -0.11 0.39 0.61 1.5
## Hotels_OtherLodgings -0.09 -0.08 0.08 -0.69 -0.01 -0.12 -0.21 0.55 0.45 1.3
## JuiceBars -0.28 -0.33 -0.21 -0.51 0.04 -0.03 -0.10 0.51 0.49 2.9
## ArtGalleries -0.34 -0.42 -0.14 -0.10 -0.09 -0.13 -0.12 0.36 0.64 2.8
## DanceClubs -0.11 0.01 0.01 0.17 -0.26 0.14 -0.11 0.14 0.86 3.2
## SwimmingPools 0.09 -0.03 -0.08 0.08 -0.13 0.54 -0.09 0.34 0.66 1.3
## Gyms 0.09 -0.11 -0.15 0.04 -0.07 0.72 -0.03 0.56 0.44 1.2
## Bakeries 0.06 -0.19 -0.34 -0.01 -0.06 0.37 0.14 0.31 0.69 2.9
## BeautySpas 0.00 -0.21 -0.31 0.17 -0.19 0.03 0.08 0.21 0.79 3.4
## Cafes 0.24 -0.24 -0.20 0.37 -0.22 0.07 -0.18 0.38 0.62 4.7
## ViewPoints 0.46 0.15 -0.10 0.37 -0.42 -0.09 -0.15 0.59 0.41 3.6
## Monuments 0.58 0.05 -0.07 0.23 -0.19 0.00 -0.07 0.44 0.56 1.6
## Gardens 0.68 -0.01 -0.15 0.11 -0.02 0.15 0.08 0.52 0.48 1.3
##
## PA1 PA2 PA4 PA3 PA6 PA5 PA7
## SS loadings 2.26 2.03 1.95 1.67 1.59 1.27 0.98
## Proportion Var 0.09 0.08 0.08 0.07 0.07 0.05 0.04
## Cumulative Var 0.09 0.18 0.26 0.33 0.40 0.45 0.49
## Proportion Explained 0.19 0.17 0.17 0.14 0.14 0.11 0.08
## Cumulative Proportion 0.19 0.37 0.53 0.67 0.81 0.92 1.00
##
## Mean item complexity = 2.3
## Test of the hypothesis that 7 factors are sufficient.
##
## The degrees of freedom for the null model are 276 and the objective function was 28.93 with Chi Square of 107452.8
## The degrees of freedom for the model are 129 and the objective function was 21.66
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is 0.04
##
## The harmonic number of observations is 3724 with the empirical chi square 1723.45 with prob < 9.7e-277
## The total number of observations was 3724 with Likelihood Chi Square = 80360.28 with prob < 0
##
## Tucker Lewis Index of factoring reliability = -0.604
## RMSEA index = 0.409 and the 90 % confidence intervals are 0.406 0.411
## BIC = 79299.57
## Fit based upon off diagonal values = 0.98
print(fit$loadings, cutoff = 0.3, sort = T)
##
## Loadings:
## PA1 PA2 PA4 PA3 PA6 PA5 PA7
## Churches 0.591
## Monuments 0.581
## Gardens 0.675
## Parks 0.741
## Theatres 0.816
## Zoo 0.532 0.483
## Pubs_Bars -0.316 0.709
## LocalServices 0.635
## Burger_PizzaShops -0.564
## Hotels_OtherLodgings -0.689
## JuiceBars -0.329 -0.511
## Museums 0.451 0.619
## Malls -0.382 0.610
## SwimmingPools 0.542
## Gyms 0.716
## Resorts 0.655
## Beaches 0.448
## Restaurants -0.473 0.420 0.343
## ArtGalleries -0.344 -0.424
## DanceClubs
## Bakeries -0.336 0.368
## BeautySpas -0.305
## Cafes 0.370
## ViewPoints 0.461 0.371 -0.424
##
## PA1 PA2 PA4 PA3 PA6 PA5 PA7
## SS loadings 2.260 2.031 1.948 1.672 1.589 1.270 0.979
## Proportion Var 0.094 0.085 0.081 0.070 0.066 0.053 0.041
## Cumulative Var 0.094 0.179 0.260 0.330 0.396 0.449 0.490
fit2 <- fa(X, nfactors = 7, rotate = "varimax", fm = "minres")
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## In factor.scores, the correlation matrix is singular, an approximation is used
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
fit2
## Factor Analysis using method = minres
## Call: fa(r = X, nfactors = 7, rotate = "varimax", fm = "minres")
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 MR2 MR4 MR3 MR6 MR5 MR7 h2 u2 com
## Churches 0.59 -0.06 -0.22 0.15 -0.06 0.06 0.27 0.50 0.50 1.9
## Resorts 0.06 0.00 -0.11 0.16 0.03 -0.02 0.66 0.48 0.52 1.2
## Beaches 0.11 0.27 -0.21 0.12 -0.13 -0.12 0.45 0.37 0.63 2.9
## Parks -0.01 0.74 -0.02 0.11 -0.18 -0.15 0.13 0.63 0.37 1.3
## Theatres 0.01 0.82 -0.06 -0.02 0.14 -0.12 -0.02 0.70 0.30 1.1
## Museums -0.18 0.45 -0.09 0.12 0.62 -0.12 -0.15 0.68 0.32 2.4
## Malls -0.38 -0.06 0.23 -0.07 0.61 -0.15 -0.12 0.61 0.39 2.3
## Zoo -0.24 -0.05 0.53 0.03 0.48 -0.16 -0.04 0.60 0.40 2.6
## Restaurants -0.47 -0.20 0.42 0.14 0.34 -0.25 -0.07 0.65 0.35 4.1
## Pubs_Bars -0.32 -0.15 0.71 0.00 0.04 -0.21 -0.11 0.68 0.32 1.8
## LocalServices -0.08 0.02 0.63 -0.30 -0.07 -0.13 -0.12 0.54 0.46 1.7
## Burger_PizzaShops -0.17 -0.02 0.18 -0.56 0.04 0.01 -0.11 0.39 0.61 1.5
## Hotels_OtherLodgings -0.09 -0.08 0.08 -0.69 -0.01 -0.12 -0.21 0.55 0.45 1.3
## JuiceBars -0.28 -0.33 -0.21 -0.51 0.04 -0.03 -0.10 0.51 0.49 2.9
## ArtGalleries -0.34 -0.42 -0.14 -0.10 -0.09 -0.13 -0.12 0.36 0.64 2.8
## DanceClubs -0.11 0.01 0.01 0.17 -0.26 0.14 -0.11 0.14 0.86 3.2
## SwimmingPools 0.09 -0.03 -0.09 0.08 -0.13 0.54 -0.09 0.34 0.66 1.3
## Gyms 0.09 -0.11 -0.15 0.04 -0.07 0.72 -0.03 0.57 0.43 1.2
## Bakeries 0.06 -0.19 -0.34 -0.01 -0.06 0.37 0.14 0.31 0.69 2.9
## BeautySpas 0.00 -0.21 -0.31 0.17 -0.19 0.03 0.08 0.21 0.79 3.4
## Cafes 0.24 -0.24 -0.20 0.37 -0.22 0.07 -0.18 0.38 0.62 4.7
## ViewPoints 0.46 0.15 -0.10 0.37 -0.42 -0.09 -0.15 0.59 0.41 3.6
## Monuments 0.58 0.05 -0.07 0.23 -0.19 0.00 -0.07 0.44 0.56 1.6
## Gardens 0.68 -0.01 -0.15 0.11 -0.02 0.15 0.08 0.52 0.48 1.3
##
## MR1 MR2 MR4 MR3 MR6 MR5 MR7
## SS loadings 2.26 2.03 1.95 1.67 1.59 1.27 0.98
## Proportion Var 0.09 0.08 0.08 0.07 0.07 0.05 0.04
## Cumulative Var 0.09 0.18 0.26 0.33 0.40 0.45 0.49
## Proportion Explained 0.19 0.17 0.17 0.14 0.14 0.11 0.08
## Cumulative Proportion 0.19 0.37 0.53 0.67 0.81 0.92 1.00
##
## Mean item complexity = 2.3
## Test of the hypothesis that 7 factors are sufficient.
##
## The degrees of freedom for the null model are 276 and the objective function was 28.93 with Chi Square of 107452.8
## The degrees of freedom for the model are 129 and the objective function was 21.66
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is 0.04
##
## The harmonic number of observations is 3724 with the empirical chi square 1723.41 with prob < 9.9e-277
## The total number of observations was 3724 with Likelihood Chi Square = 80359.23 with prob < 0
##
## Tucker Lewis Index of factoring reliability = -0.604
## RMSEA index = 0.409 and the 90 % confidence intervals are 0.406 0.411
## BIC = 79298.52
## Fit based upon off diagonal values = 0.98
print(fit2$loadings, cutoff = 0.3, sort = T)
##
## Loadings:
## MR1 MR2 MR4 MR3 MR6 MR5 MR7
## Churches 0.591
## Monuments 0.581
## Gardens 0.675
## Parks 0.741
## Theatres 0.815
## Zoo 0.533 0.482
## Pubs_Bars -0.316 0.709
## LocalServices 0.634
## Burger_PizzaShops -0.564
## Hotels_OtherLodgings -0.689
## JuiceBars -0.329 -0.511
## Museums 0.450 0.621
## Malls -0.382 0.610
## SwimmingPools 0.541
## Gyms 0.718
## Resorts 0.659
## Beaches 0.445
## Restaurants -0.473 0.421 0.342
## ArtGalleries -0.344 -0.424
## DanceClubs
## Bakeries -0.337 0.368
## BeautySpas -0.306
## Cafes 0.370
## ViewPoints 0.461 0.371 -0.423
##
## MR1 MR2 MR4 MR3 MR6 MR5 MR7
## SS loadings 2.261 2.031 1.951 1.672 1.587 1.271 0.980
## Proportion Var 0.094 0.085 0.081 0.070 0.066 0.053 0.041
## Cumulative Var 0.094 0.179 0.260 0.330 0.396 0.449 0.490
fit3 <- fa(X, nfactors = 7, rotate = "varimax", fm = "ols")
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## In factor.scores, the correlation matrix is singular, an approximation is used
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
fit3
## Factor Analysis using method = ols
## Call: fa(r = X, nfactors = 7, rotate = "varimax", fm = "ols")
## Standardized loadings (pattern matrix) based upon correlation matrix
## 1 2 3 4 5 6 7 h2 u2 com
## Churches 0.59 -0.06 -0.22 0.15 -0.06 0.06 0.27 0.50 0.50 1.9
## Resorts 0.06 0.00 -0.11 0.16 0.03 -0.02 0.66 0.48 0.52 1.2
## Beaches 0.11 0.27 -0.21 0.12 -0.13 -0.12 0.45 0.37 0.63 2.9
## Parks -0.01 0.74 -0.02 0.11 -0.18 -0.15 0.13 0.63 0.37 1.3
## Theatres 0.01 0.82 -0.06 -0.02 0.14 -0.12 -0.02 0.70 0.30 1.1
## Museums -0.18 0.45 -0.09 0.12 0.62 -0.12 -0.15 0.68 0.32 2.4
## Malls -0.38 -0.06 0.23 -0.07 0.61 -0.15 -0.12 0.61 0.39 2.3
## Zoo -0.24 -0.05 0.53 0.03 0.48 -0.16 -0.04 0.60 0.40 2.6
## Restaurants -0.47 -0.20 0.42 0.14 0.34 -0.25 -0.07 0.65 0.35 4.1
## Pubs_Bars -0.32 -0.15 0.71 0.00 0.04 -0.21 -0.11 0.68 0.32 1.8
## LocalServices -0.08 0.02 0.63 -0.30 -0.07 -0.13 -0.12 0.54 0.46 1.7
## Burger_PizzaShops -0.17 -0.02 0.18 -0.56 0.04 0.01 -0.11 0.39 0.61 1.5
## Hotels_OtherLodgings -0.09 -0.08 0.08 -0.69 -0.01 -0.12 -0.21 0.55 0.45 1.3
## JuiceBars -0.28 -0.33 -0.21 -0.51 0.04 -0.03 -0.10 0.51 0.49 2.9
## ArtGalleries -0.34 -0.42 -0.14 -0.10 -0.09 -0.13 -0.12 0.36 0.64 2.8
## DanceClubs -0.11 0.01 0.01 0.17 -0.26 0.14 -0.11 0.14 0.86 3.2
## SwimmingPools 0.09 -0.03 -0.09 0.08 -0.13 0.54 -0.09 0.34 0.66 1.3
## Gyms 0.09 -0.11 -0.15 0.04 -0.07 0.72 -0.03 0.57 0.43 1.2
## Bakeries 0.06 -0.19 -0.34 -0.01 -0.06 0.37 0.14 0.31 0.69 2.9
## BeautySpas 0.00 -0.21 -0.31 0.17 -0.19 0.03 0.08 0.21 0.79 3.4
## Cafes 0.24 -0.24 -0.20 0.37 -0.22 0.07 -0.18 0.38 0.62 4.7
## ViewPoints 0.46 0.15 -0.10 0.37 -0.42 -0.09 -0.15 0.59 0.41 3.6
## Monuments 0.58 0.05 -0.07 0.23 -0.19 0.00 -0.07 0.44 0.56 1.6
## Gardens 0.68 -0.01 -0.15 0.11 -0.02 0.15 0.08 0.52 0.48 1.3
##
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## SS loadings 2.26 2.03 1.95 1.67 1.59 1.27 0.98
## Proportion Var 0.09 0.08 0.08 0.07 0.07 0.05 0.04
## Cumulative Var 0.09 0.18 0.26 0.33 0.40 0.45 0.49
## Proportion Explained 0.19 0.17 0.17 0.14 0.14 0.11 0.08
## Cumulative Proportion 0.19 0.37 0.53 0.67 0.81 0.92 1.00
##
## Mean item complexity = 2.3
## Test of the hypothesis that 7 factors are sufficient.
##
## The degrees of freedom for the null model are 276 and the objective function was 28.93 with Chi Square of 107452.8
## The degrees of freedom for the model are 129 and the objective function was 21.66
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is 0.04
##
## The harmonic number of observations is 3724 with the empirical chi square 1723.41 with prob < 9.9e-277
## The total number of observations was 3724 with Likelihood Chi Square = 80359.23 with prob < 0
##
## Tucker Lewis Index of factoring reliability = -0.604
## RMSEA index = 0.409 and the 90 % confidence intervals are 0.406 0.411
## BIC = 79298.52
## Fit based upon off diagonal values = 0.98
print(fit3$loadings, cutoff = 0.3, sort = T)
##
## Loadings:
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## Churches 0.591
## Monuments 0.581
## Gardens 0.675
## Parks 0.741
## Theatres 0.815
## Zoo 0.533 0.482
## Pubs_Bars -0.316 0.709
## LocalServices 0.634
## Burger_PizzaShops -0.564
## Hotels_OtherLodgings -0.689
## JuiceBars -0.329 -0.511
## Museums 0.450 0.621
## Malls -0.382 0.610
## SwimmingPools 0.541
## Gyms 0.718
## Resorts 0.659
## Beaches 0.445
## Restaurants -0.473 0.421 0.342
## ArtGalleries -0.344 -0.424
## DanceClubs
## Bakeries -0.337 0.368
## BeautySpas -0.306
## Cafes 0.370
## ViewPoints 0.461 0.371 -0.423
##
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## SS loadings 2.261 2.031 1.951 1.672 1.587 1.271 0.980
## Proportion Var 0.094 0.085 0.081 0.070 0.066 0.053 0.041
## Cumulative Var 0.094 0.179 0.260 0.330 0.396 0.449 0.490
fit4 <- fa(X, nfactors=8, rotate="varimax", fm = "pa")
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## In factor.scores, the correlation matrix is singular, an approximation is used
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
fit4 # print results
## Factor Analysis using method = pa
## Call: fa(r = X, nfactors = 8, rotate = "varimax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA2 PA1 PA4 PA3 PA8 PA5 PA7 PA6 h2 u2
## Churches -0.04 0.56 -0.21 0.18 -0.15 0.07 -0.24 0.12 0.49 0.51
## Resorts 0.01 0.08 -0.10 0.14 0.00 -0.02 -0.67 0.05 0.48 0.52
## Beaches 0.28 0.06 -0.20 0.14 -0.19 -0.10 -0.42 0.09 0.38 0.62
## Parks 0.73 -0.05 -0.02 0.12 -0.17 -0.14 -0.12 -0.06 0.62 0.38
## Theatres 0.82 0.03 -0.07 -0.04 0.16 -0.13 0.01 -0.03 0.73 0.27
## Museums 0.44 -0.16 -0.11 0.11 0.60 -0.13 0.14 0.18 0.68 0.32
## Malls -0.06 -0.33 0.21 -0.09 0.62 -0.17 0.10 0.14 0.61 0.39
## Zoo -0.05 -0.23 0.52 0.03 0.47 -0.17 0.04 0.17 0.61 0.39
## Restaurants -0.21 -0.46 0.41 0.13 0.38 -0.25 0.06 0.07 0.66 0.34
## Pubs_Bars -0.15 -0.30 0.70 -0.02 0.11 -0.22 0.09 -0.08 0.68 0.32
## LocalServices 0.02 -0.12 0.64 -0.26 -0.09 -0.13 0.15 0.05 0.54 0.46
## Burger_PizzaShops -0.02 -0.11 0.19 -0.61 0.07 -0.02 0.11 -0.05 0.44 0.56
## Hotels_OtherLodgings -0.08 -0.13 0.09 -0.63 -0.10 -0.11 0.27 0.22 0.58 0.42
## JuiceBars -0.33 -0.28 -0.21 -0.49 0.00 -0.02 0.13 0.13 0.50 0.50
## ArtGalleries -0.45 -0.27 -0.16 -0.17 0.02 -0.17 0.07 -0.27 0.44 0.56
## DanceClubs 0.00 -0.06 0.01 0.10 -0.13 0.12 0.06 -0.36 0.18 0.82
## SwimmingPools -0.03 0.11 -0.08 0.05 -0.07 0.53 0.07 -0.16 0.34 0.66
## Gyms -0.11 0.09 -0.14 0.03 -0.05 0.71 0.03 -0.05 0.56 0.44
## Bakeries -0.18 0.00 -0.33 0.03 -0.13 0.42 -0.11 0.16 0.37 0.63
## BeautySpas -0.21 -0.03 -0.30 0.18 -0.20 0.06 -0.08 -0.01 0.22 0.78
## Cafes -0.24 0.28 -0.20 0.34 -0.15 0.06 0.14 -0.24 0.39 0.61
## ViewPoints 0.16 0.36 -0.09 0.47 -0.51 -0.06 0.21 0.01 0.69 0.31
## Monuments 0.06 0.54 -0.06 0.26 -0.23 0.02 0.07 -0.01 0.43 0.57
## Gardens 0.00 0.71 -0.13 0.10 -0.06 0.14 -0.08 0.03 0.57 0.43
## com
## Churches 2.3
## Resorts 1.2
## Beaches 3.4
## Parks 1.3
## Theatres 1.2
## Museums 2.7
## Malls 2.3
## Zoo 2.9
## Restaurants 4.3
## Pubs_Bars 1.8
## LocalServices 1.7
## Burger_PizzaShops 1.4
## Hotels_OtherLodgings 2.0
## JuiceBars 3.3
## ArtGalleries 3.6
## DanceClubs 1.8
## SwimmingPools 1.4
## Gyms 1.2
## Bakeries 3.1
## BeautySpas 3.7
## Cafes 5.4
## ViewPoints 3.5
## Monuments 2.0
## Gardens 1.2
##
## PA2 PA1 PA4 PA3 PA8 PA5 PA7 PA6
## SS loadings 2.06 2.05 1.89 1.71 1.67 1.31 0.97 0.51
## Proportion Var 0.09 0.09 0.08 0.07 0.07 0.05 0.04 0.02
## Cumulative Var 0.09 0.17 0.25 0.32 0.39 0.45 0.49 0.51
## Proportion Explained 0.17 0.17 0.16 0.14 0.14 0.11 0.08 0.04
## Cumulative Proportion 0.17 0.34 0.49 0.63 0.77 0.88 0.96 1.00
##
## Mean item complexity = 2.4
## Test of the hypothesis that 8 factors are sufficient.
##
## The degrees of freedom for the null model are 276 and the objective function was 28.93 with Chi Square of 107452.8
## The degrees of freedom for the model are 112 and the objective function was 21.55
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is 0.04
##
## The harmonic number of observations is 3724 with the empirical chi square 1431.89 with prob < 1e-227
## The total number of observations was 3724 with Likelihood Chi Square = 79919.21 with prob < 0
##
## Tucker Lewis Index of factoring reliability = -0.838
## RMSEA index = 0.437 and the 90 % confidence intervals are 0.435 0.44
## BIC = 78998.28
## Fit based upon off diagonal values = 0.99
print(fit4$loadings, cutoff = 0.3, sort = T)
##
## Loadings:
## PA2 PA1 PA4 PA3 PA8 PA5 PA7 PA6
## Parks 0.734
## Theatres 0.822
## Churches 0.560
## Monuments 0.540
## Gardens 0.711
## Zoo 0.521 0.469
## Pubs_Bars -0.304 0.698
## LocalServices 0.635
## Burger_PizzaShops -0.611
## Hotels_OtherLodgings -0.635
## Museums 0.443 0.602
## Malls -0.331 0.621
## ViewPoints 0.356 0.472 -0.509
## SwimmingPools 0.530
## Gyms 0.712
## Resorts -0.666
## Beaches -0.424
## Restaurants -0.464 0.406 0.377
## JuiceBars -0.333 -0.488
## ArtGalleries -0.451
## DanceClubs -0.365
## Bakeries -0.330 0.419
## BeautySpas
## Cafes 0.339
##
## PA2 PA1 PA4 PA3 PA8 PA5 PA7 PA6
## SS loadings 2.063 2.045 1.889 1.705 1.673 1.306 0.971 0.511
## Proportion Var 0.086 0.085 0.079 0.071 0.070 0.054 0.040 0.021
## Cumulative Var 0.086 0.171 0.250 0.321 0.391 0.445 0.486 0.507
# Interpretation of Factors
plot(fit$loadings[,1:2], type="n")
text(fit$loadings[,1:2], labels = names(X), cex = 0.7)
abline(h = 0, v = 0)

plot(fit2$loadings[,1:2], type="n")
text(fit2$loadings[,1:2], labels = names(X), cex = 0.7)
abline(h = 0, v = 0)

plot(fit3$loadings[,1:2], type="n")
text(fit3$loadings[,1:2], labels = names(X), cex = 0.7)
abline(h = 0, v = 0)

plot(fit4$loadings[,1:2], type="n")
text(fit4$loadings[,1:2], labels = names(X), cex = 0.7)
abline(h = 0, v = 0)

# Determine Number of Factors to Extract
ev <- eigen(cor(X)) # get eigenvalues
ap <- parallel(subject=nrow(X),var=ncol(X),
rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)

# Well at least this supports my PCA thoughts
fa.diagram(fit3)
