# NK Project 3 EDA
# PREP #########################################################################
# Packages
pacman::p_load(
pacman, rio, tidyverse, magrittr, janitor, # general stuff
psych, # EDA
visdat, # missingness
data.table, # working with data.tables
corrplot, # correlation plot
FactoMineR, # EDA, PCA, MFA
factoextra, # extract and visualize PCA/MFA
nFactors, # how many factors/components to retain
cluster, # clustering algorithms
NbClust, # number of clusters
clValid # presumably cluster validation -- NOTE(review): not called in the code visible here; confirm it is needed
)
# Data
# Read the fixed cell range of the ratings workbook; cells holding "0" are
# treated as missing values.
data <- rio::import(
  file = "data/Travel_Review.xlsx",
  range = "A1:Y5457",
  na = "0"
)
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in L2714 / R2714C12: got '2 2.'
# clean variable names
# Give the one column whose name contains a space a syntactic name, then
# inspect the resulting structure.
data <- dplyr::rename(data, SwimmingPools = `Swimming Pools`)
str(data)
## 'data.frame': 5456 obs. of 25 variables:
## $ UserID : chr "User 1" "User 2" "User 3" "User 4" ...
## $ Churches : num NA NA NA NA NA NA NA NA NA NA ...
## $ Resorts : num NA NA NA 0.5 NA NA 5 5 5 5 ...
## $ Beaches : num 3.63 3.63 3.63 3.63 3.63 3.63 3.63 3.63 3.64 3.64 ...
## $ Parks : num 3.65 3.65 3.63 3.63 3.63 3.63 3.63 3.63 3.64 3.64 ...
## $ Theatres : num 5 5 5 5 5 5 5 5 5 5 ...
## $ Museums : num 2.92 2.92 2.92 2.92 2.92 2.92 2.92 2.92 2.92 2.92 ...
## $ Malls : num 5 5 5 5 5 5 3.03 5 3.03 5 ...
## $ Zoo : num 2.35 2.64 2.64 2.35 2.64 2.63 2.35 2.63 2.62 2.35 ...
## $ Restaurants : num 2.33 2.33 2.33 2.33 2.33 2.33 2.33 2.33 2.32 2.32 ...
## $ Pubs_Bars : num 2.64 2.65 2.64 2.64 2.64 2.65 2.64 2.64 2.63 2.63 ...
## $ LocalServices : num 1.7 1.7 1.7 1.73 1.7 1.71 1.73 1.7 1.71 1.69 ...
## $ Burger_PizzaShops : num 1.69 1.69 1.69 1.69 1.69 1.69 1.68 1.68 1.67 1.67 ...
## $ Hotels_OtherLodgings: num 1.7 1.7 1.7 1.7 1.7 1.69 1.69 1.69 1.68 1.67 ...
## $ JuiceBars : num 1.72 1.72 1.72 1.72 1.72 1.72 1.71 1.71 1.7 1.7 ...
## $ ArtGalleries : num 1.74 1.74 1.74 1.74 1.74 1.74 1.75 1.74 0.75 0.74 ...
## $ DanceClubs : num 0.59 0.59 0.59 0.59 0.59 0.59 0.59 0.6 0.6 0.59 ...
## $ SwimmingPools : num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 NA NA ...
## $ Gyms : num NA NA NA NA NA NA NA NA NA NA ...
## $ Bakeries : num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 NA ...
## $ BeautySpas : num NA NA NA NA NA NA NA NA NA NA ...
## $ Cafes : num NA NA NA NA NA NA NA NA NA NA ...
## $ ViewPoints : num NA NA NA NA NA NA NA NA NA NA ...
## $ Monuments : num NA NA NA NA NA NA NA NA NA NA ...
## $ Gardens : num NA NA NA NA NA NA NA NA NA NA ...
# DATA CLEANING ################################################################
# Missings
# Visual overview of missingness; TRUE is spelled out because T is an
# ordinary, reassignable variable.
vis_miss(data, cluster = TRUE, sort_miss = TRUE)

# 3.9% missing total
# Bakeries has most missing, 19.2%
# Total number of missing cells.
sum(is.na(data))
## [1] 5322
# Missing cells per column, largest first.
colSums(is.na(data)) %>% sort(decreasing = TRUE)
## Bakeries Gyms BeautySpas
## 1046 1017 896
## Cafes SwimmingPools ViewPoints
## 604 479 345
## Monuments Gardens Churches
## 302 226 195
## DanceClubs Resorts Beaches
## 112 90 4
## ArtGalleries LocalServices Burger_PizzaShops
## 4 1 1
## UserID Parks Theatres
## 0 0 0
## Museums Malls Zoo
## 0 0 0
## Restaurants Pubs_Bars Hotels_OtherLodgings
## 0 0 0
## JuiceBars
## 0
# Flag rows without any missing rating once and reuse the mask, instead of
# recomputing complete.cases(data) for every statement.
is_complete <- complete.cases(data)
sum(is_complete) # 3724 complete cases
## [1] 3724
mean(is_complete) # 68.3% complete
## [1] 0.6825513
# data %<>% drop_na()
# Split users into those with a full set of ratings and those without.
complete <- data[is_complete, ]
incomplete <- data[!is_complete, ]
# Standardizing Ratings by User
# # create objects to save into
# user_avgs <- data.frame(user_mean = rep(NA_real_, nrow(complete)),
# user_sd = rep(NA_real_, nrow(complete)))
# data2 <- complete
#
# # find mean & sd for each user
# for (i in 1:nrow(data2)) {
# user_avgs[i, ] <- c(rowMeans(data2[i,-1]), sd(data2[i,-1]))
# }
#
# for (i in 1:nrow(data2)) {
# data2[i,-1] <- data2[i,-1] %>%
# mutate(across(.fns = function(x) {
# scale(x, center = user_avgs[i,1], scale = user_avgs[i,2])
# }))
# }
# MUCH easier and faster way:
# Standardize ratings within each user (row-wise): transpose so each user is
# a column, let scale() center and scale every column, then transpose back so
# each user is a row again. Column 1 (UserID) is left untouched.
data2 <- complete
data2[, -1] <- t(scale(t(data2[, -1])))
head(data2)
## UserID Churches Resorts Beaches Parks Theatres Museums
## 69 User 69 -0.6091162 -0.6505058 -0.6173941 -0.5925604 1.890813 2.842772
## 70 User 70 -0.7217397 -0.6777013 -0.6703616 2.3903039 1.553575 2.390304
## 71 User 71 -0.6634526 -0.6193246 -0.6119699 -0.5972605 1.616495 2.447573
## 72 User 72 -0.5867433 -0.5950317 -0.5867433 -0.5618784 1.916327 1.145514
## 73 User 73 -0.6785527 -0.6350286 -0.6277746 -0.6060126 1.555684 2.368134
## 74 User 74 -0.7170883 -0.6864435 -0.6741855 1.8448168 1.158373 1.844817
## Malls Zoo Restaurants Pubs_Bars LocalServices Burger_PizzaShops
## 69 1.1292449 1.1209670 1.0381879 1.0464658 0.40906672 -0.08760788
## 70 0.8783205 0.8563013 0.7902438 0.4893150 0.22508491 -0.20795889
## 71 2.4475731 0.9104469 0.8369002 0.8369002 0.27059059 -0.17068965
## 72 2.8529057 1.1206487 1.0294773 1.0294773 0.39127734 -0.11430962
## 73 0.8592986 0.8447906 0.7649964 0.7649964 0.19918319 -0.22880375
## 74 1.8448168 0.5577353 0.4841878 0.4780588 0.00612896 -0.36773757
## Hotels_OtherLodgings JuiceBars ArtGalleries DanceClubs SwimmingPools
## 69 -0.1786649 -0.2117765 -0.1952207 -0.1786649 -0.8408977
## 70 -0.2960356 -0.3253945 -0.3180548 -0.3033753 -0.2813561
## 71 -0.2810097 -0.2810097 -0.2736550 -0.2589457 -0.2442364
## 72 -0.2303460 -0.2386343 -0.2303460 -0.2137694 -0.8105277
## 73 -0.3158519 -0.3448680 -0.3376140 -0.3231060 -0.8453951
## 74 -0.4474140 -0.4596720 -0.4535430 -0.4412851 -0.8703123
## Gyms Bakeries BeautySpas Cafes ViewPoints Monuments Gardens
## 69 -0.8160640 -0.8574535 -0.8822872 -0.6753395 -0.6918953 -0.7001732 -0.6918953
## 70 -0.8538547 -0.8978931 -0.9125725 -0.8978931 -0.7290794 -0.7437588 -0.7364191
## 71 -0.8031913 -0.8473193 -0.8546740 -0.8399647 -0.6708072 -0.6855166 -0.6634526
## 72 -0.8105277 -0.8519693 -0.8685459 -0.8519693 -0.6447615 -0.6613381 -0.6281849
## 73 -0.8308871 -0.8744112 -0.8816652 -0.8744112 2.3681339 -0.7003148 -0.6205206
## 74 -0.8519254 -0.8948281 -0.9009571 -0.8948281 1.8448168 -0.7354751 -0.6680566
# complete[1,]
# user_avgs[1,]
# data2[2,]
# DATA EXPLORATION - FULL DATA SET #############################################
# Keep only the numeric columns. vapply() always returns a logical vector
# (sapply() can change type on unusual input) and the redundant `== T`
# comparison is dropped; drop = FALSE keeps a data frame even if only one
# column qualifies.
numvars <- data2[, vapply(data2, is.numeric, logical(1)), drop = FALSE]
# Descriptive statistics for every rating column (UserID excluded).
psy <- psych::describe(data2[, -1])
# Show the statistics ordered by median.
arrange(psy, median)
## vars n mean sd median trimmed mad min max range
## Gyms 18 3724 -0.71 0.69 -0.86 -0.86 0.18 -1.35 4.16 5.51
## SwimmingPools 17 3724 -0.72 0.62 -0.84 -0.84 0.17 -1.33 4.02 5.35
## Bakeries 19 3724 -0.52 0.98 -0.84 -0.78 0.22 -1.49 4.32 5.81
## BeautySpas 20 3724 -0.51 0.97 -0.82 -0.73 0.24 -3.15 4.26 7.40
## DanceClubs 16 3724 -0.60 0.71 -0.79 -0.77 0.18 -1.51 3.33 4.84
## Cafes 21 3724 -0.63 0.73 -0.78 -0.77 0.24 -3.72 2.56 6.28
## Monuments 23 3724 -0.17 0.94 -0.64 -0.34 0.30 -1.18 3.29 4.48
## ViewPoints 22 3724 0.03 1.14 -0.62 -0.12 0.43 -1.79 3.63 5.42
## ArtGalleries 15 3724 0.14 1.21 -0.55 0.01 0.49 -1.58 4.16 5.75
## Gardens 24 3724 -0.19 0.87 -0.52 -0.38 0.35 -1.11 3.74 4.85
## Churches 1 3724 -0.29 0.60 -0.46 -0.40 0.39 -1.10 2.95 4.05
## JuiceBars 14 3724 0.04 1.05 -0.41 -0.10 0.56 -1.54 4.00 5.54
## Hotels_OtherLodgings 13 3724 0.03 0.92 -0.28 -0.10 0.57 -1.51 3.48 4.99
## Burger_PizzaShops 12 3724 -0.08 0.76 -0.26 -0.22 0.53 -1.14 2.77 3.91
## LocalServices 11 3724 0.17 0.82 -0.12 0.07 0.61 -1.07 2.83 3.90
## Zoo 8 3724 0.18 0.69 -0.06 0.11 0.56 -1.01 2.54 3.55
## Resorts 2 3724 0.27 0.92 -0.05 0.17 0.72 -0.91 4.13 5.05
## Beaches 3 3724 0.30 0.79 0.00 0.18 0.51 -0.87 4.32 5.19
## Pubs_Bars 10 3724 0.30 0.79 0.19 0.23 0.75 -1.09 2.84 3.92
## Parks 4 3724 0.53 0.87 0.26 0.44 0.79 -0.78 4.18 4.96
## Museums 6 3724 0.48 0.84 0.31 0.40 0.80 -0.86 2.85 3.71
## Restaurants 9 3724 0.60 0.93 0.38 0.56 1.02 -1.07 2.85 3.92
## Theatres 5 3724 0.57 0.86 0.39 0.50 0.93 -0.77 3.61 4.38
## Malls 7 3724 0.79 0.97 0.57 0.77 1.27 -0.91 4.04 4.95
## skew kurtosis se
## Gyms 4.09 17.69 0.01
## SwimmingPools 4.50 21.41 0.01
## Bakeries 2.69 6.69 0.02
## BeautySpas 2.34 5.42 0.02
## DanceClubs 3.08 8.95 0.01
## Cafes 1.99 5.87 0.01
## Monuments 1.40 0.62 0.02
## ViewPoints 0.96 -0.62 0.02
## ArtGalleries 0.87 -0.75 0.02
## Gardens 1.85 2.59 0.01
## Churches 1.95 4.41 0.01
## JuiceBars 1.11 -0.18 0.02
## Hotels_OtherLodgings 1.15 -0.01 0.02
## Burger_PizzaShops 1.47 1.42 0.01
## LocalServices 1.04 0.08 0.01
## Zoo 0.90 0.05 0.01
## Resorts 0.95 -0.14 0.02
## Beaches 1.32 1.32 0.01
## Pubs_Bars 0.73 -0.48 0.01
## Parks 0.82 -0.20 0.01
## Museums 0.77 -0.45 0.01
## Restaurants 0.42 -1.12 0.02
## Theatres 0.62 -0.73 0.01
## Malls 0.22 -1.18 0.02
# SwimmingPools & Gyms are VERY skewed
# Descriptive statistics ordered from least to most skewed.
psy %>% arrange(skew)
## vars n mean sd median trimmed mad min max range
## Malls 7 3724 0.79 0.97 0.57 0.77 1.27 -0.91 4.04 4.95
## Restaurants 9 3724 0.60 0.93 0.38 0.56 1.02 -1.07 2.85 3.92
## Theatres 5 3724 0.57 0.86 0.39 0.50 0.93 -0.77 3.61 4.38
## Pubs_Bars 10 3724 0.30 0.79 0.19 0.23 0.75 -1.09 2.84 3.92
## Museums 6 3724 0.48 0.84 0.31 0.40 0.80 -0.86 2.85 3.71
## Parks 4 3724 0.53 0.87 0.26 0.44 0.79 -0.78 4.18 4.96
## ArtGalleries 15 3724 0.14 1.21 -0.55 0.01 0.49 -1.58 4.16 5.75
## Zoo 8 3724 0.18 0.69 -0.06 0.11 0.56 -1.01 2.54 3.55
## Resorts 2 3724 0.27 0.92 -0.05 0.17 0.72 -0.91 4.13 5.05
## ViewPoints 22 3724 0.03 1.14 -0.62 -0.12 0.43 -1.79 3.63 5.42
## LocalServices 11 3724 0.17 0.82 -0.12 0.07 0.61 -1.07 2.83 3.90
## JuiceBars 14 3724 0.04 1.05 -0.41 -0.10 0.56 -1.54 4.00 5.54
## Hotels_OtherLodgings 13 3724 0.03 0.92 -0.28 -0.10 0.57 -1.51 3.48 4.99
## Beaches 3 3724 0.30 0.79 0.00 0.18 0.51 -0.87 4.32 5.19
## Monuments 23 3724 -0.17 0.94 -0.64 -0.34 0.30 -1.18 3.29 4.48
## Burger_PizzaShops 12 3724 -0.08 0.76 -0.26 -0.22 0.53 -1.14 2.77 3.91
## Gardens 24 3724 -0.19 0.87 -0.52 -0.38 0.35 -1.11 3.74 4.85
## Churches 1 3724 -0.29 0.60 -0.46 -0.40 0.39 -1.10 2.95 4.05
## Cafes 21 3724 -0.63 0.73 -0.78 -0.77 0.24 -3.72 2.56 6.28
## BeautySpas 20 3724 -0.51 0.97 -0.82 -0.73 0.24 -3.15 4.26 7.40
## Bakeries 19 3724 -0.52 0.98 -0.84 -0.78 0.22 -1.49 4.32 5.81
## DanceClubs 16 3724 -0.60 0.71 -0.79 -0.77 0.18 -1.51 3.33 4.84
## Gyms 18 3724 -0.71 0.69 -0.86 -0.86 0.18 -1.35 4.16 5.51
## SwimmingPools 17 3724 -0.72 0.62 -0.84 -0.84 0.17 -1.33 4.02 5.35
## skew kurtosis se
## Malls 0.22 -1.18 0.02
## Restaurants 0.42 -1.12 0.02
## Theatres 0.62 -0.73 0.01
## Pubs_Bars 0.73 -0.48 0.01
## Museums 0.77 -0.45 0.01
## Parks 0.82 -0.20 0.01
## ArtGalleries 0.87 -0.75 0.02
## Zoo 0.90 0.05 0.01
## Resorts 0.95 -0.14 0.02
## ViewPoints 0.96 -0.62 0.02
## LocalServices 1.04 0.08 0.01
## JuiceBars 1.11 -0.18 0.02
## Hotels_OtherLodgings 1.15 -0.01 0.02
## Beaches 1.32 1.32 0.01
## Monuments 1.40 0.62 0.02
## Burger_PizzaShops 1.47 1.42 0.01
## Gardens 1.85 2.59 0.01
## Churches 1.95 4.41 0.01
## Cafes 1.99 5.87 0.01
## BeautySpas 2.34 5.42 0.02
## Bakeries 2.69 6.69 0.02
## DanceClubs 3.08 8.95 0.01
## Gyms 4.09 17.69 0.01
## SwimmingPools 4.50 21.41 0.01
# Descriptive statistics ordered from narrowest to widest range.
psy %>% arrange(range)
## vars n mean sd median trimmed mad min max range
## Zoo 8 3724 0.18 0.69 -0.06 0.11 0.56 -1.01 2.54 3.55
## Museums 6 3724 0.48 0.84 0.31 0.40 0.80 -0.86 2.85 3.71
## LocalServices 11 3724 0.17 0.82 -0.12 0.07 0.61 -1.07 2.83 3.90
## Burger_PizzaShops 12 3724 -0.08 0.76 -0.26 -0.22 0.53 -1.14 2.77 3.91
## Restaurants 9 3724 0.60 0.93 0.38 0.56 1.02 -1.07 2.85 3.92
## Pubs_Bars 10 3724 0.30 0.79 0.19 0.23 0.75 -1.09 2.84 3.92
## Churches 1 3724 -0.29 0.60 -0.46 -0.40 0.39 -1.10 2.95 4.05
## Theatres 5 3724 0.57 0.86 0.39 0.50 0.93 -0.77 3.61 4.38
## Monuments 23 3724 -0.17 0.94 -0.64 -0.34 0.30 -1.18 3.29 4.48
## DanceClubs 16 3724 -0.60 0.71 -0.79 -0.77 0.18 -1.51 3.33 4.84
## Gardens 24 3724 -0.19 0.87 -0.52 -0.38 0.35 -1.11 3.74 4.85
## Malls 7 3724 0.79 0.97 0.57 0.77 1.27 -0.91 4.04 4.95
## Parks 4 3724 0.53 0.87 0.26 0.44 0.79 -0.78 4.18 4.96
## Hotels_OtherLodgings 13 3724 0.03 0.92 -0.28 -0.10 0.57 -1.51 3.48 4.99
## Resorts 2 3724 0.27 0.92 -0.05 0.17 0.72 -0.91 4.13 5.05
## Beaches 3 3724 0.30 0.79 0.00 0.18 0.51 -0.87 4.32 5.19
## SwimmingPools 17 3724 -0.72 0.62 -0.84 -0.84 0.17 -1.33 4.02 5.35
## ViewPoints 22 3724 0.03 1.14 -0.62 -0.12 0.43 -1.79 3.63 5.42
## Gyms 18 3724 -0.71 0.69 -0.86 -0.86 0.18 -1.35 4.16 5.51
## JuiceBars 14 3724 0.04 1.05 -0.41 -0.10 0.56 -1.54 4.00 5.54
## ArtGalleries 15 3724 0.14 1.21 -0.55 0.01 0.49 -1.58 4.16 5.75
## Bakeries 19 3724 -0.52 0.98 -0.84 -0.78 0.22 -1.49 4.32 5.81
## Cafes 21 3724 -0.63 0.73 -0.78 -0.77 0.24 -3.72 2.56 6.28
## BeautySpas 20 3724 -0.51 0.97 -0.82 -0.73 0.24 -3.15 4.26 7.40
## skew kurtosis se
## Zoo 0.90 0.05 0.01
## Museums 0.77 -0.45 0.01
## LocalServices 1.04 0.08 0.01
## Burger_PizzaShops 1.47 1.42 0.01
## Restaurants 0.42 -1.12 0.02
## Pubs_Bars 0.73 -0.48 0.01
## Churches 1.95 4.41 0.01
## Theatres 0.62 -0.73 0.01
## Monuments 1.40 0.62 0.02
## DanceClubs 3.08 8.95 0.01
## Gardens 1.85 2.59 0.01
## Malls 0.22 -1.18 0.02
## Parks 0.82 -0.20 0.01
## Hotels_OtherLodgings 1.15 -0.01 0.02
## Resorts 0.95 -0.14 0.02
## Beaches 1.32 1.32 0.01
## SwimmingPools 4.50 21.41 0.01
## ViewPoints 0.96 -0.62 0.02
## Gyms 4.09 17.69 0.01
## JuiceBars 1.11 -0.18 0.02
## ArtGalleries 0.87 -0.75 0.02
## Bakeries 2.69 6.69 0.02
## Cafes 1.99 5.87 0.01
## BeautySpas 2.34 5.42 0.02
# UNIVARIATE
# One histogram per raw rating column, in the order listed. The title and
# x-axis label are built explicitly so each plot carries the same text that
# hist(data$<column>) would generate by default.
invisible(lapply(
  c(
    "Churches", # skews low
    "Resorts",
    "Beaches",
    "Parks",
    "Theatres", # lots of 5s
    "Museums", # lots of 5s
    "Malls", # lots of 5s
    "Zoo",
    "Restaurants",
    "Pubs_Bars",
    "LocalServices",
    "Burger_PizzaShops",
    "Hotels_OtherLodgings",
    "JuiceBars",
    "ArtGalleries",
    "DanceClubs", # lots of 1s
    "SwimmingPools", # lots of 1s
    "Gyms", # lots of 1s
    "Bakeries", # lots of 1s
    "BeautySpas", # lots of 1s
    "Cafes", # lots of 1s
    "ViewPoints",
    "Monuments",
    "Gardens"
  ),
  function(category) {
    label <- paste0("data$", category)
    hist(data[[category]], main = paste("Histogram of", label), xlab = label)
  }
))

# Single-panel layout. The earlier par(mfrow = c(4, 1)) call was overridden
# on the very next line and never affected a plot, so it is dropped.
par(mfrow = c(1, 1))
# One histogram per user-standardized rating column, in the order listed.
# The title and x-axis label are built explicitly so each plot carries the
# same text that hist(data2$<column>) would generate by default.
invisible(lapply(
  c(
    "Malls",
    "Restaurants",
    "Theatres",
    "Pubs_Bars",
    "Museums",
    "Parks",
    "ArtGalleries",
    "Zoo",
    "Resorts",
    "ViewPoints",
    "LocalServices",
    "JuiceBars",
    "Hotels_OtherLodgings",
    "Beaches",
    "Monuments",
    "Burger_PizzaShops",
    "Gardens",
    "Churches",
    "Cafes", # range
    "BeautySpas", # range
    "Bakeries",
    "DanceClubs",
    "Gyms", # skew
    "SwimmingPools" # skew
  ),
  function(category) {
    label <- paste0("data2$", category)
    hist(data2[[category]], main = paste("Histogram of", label), xlab = label)
  }
))

# Five-number summaries (plus mean) for the four columns singled out above
# for their range (BeautySpas, Cafes) or skew (Gyms, SwimmingPools).
summary(data2[["BeautySpas"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1454 -0.9766 -0.8243 -0.5069 -0.6442 4.2574
summary(data2[["Cafes"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.7168 -0.9556 -0.7821 -0.6344 -0.6400 2.5628
summary(data2[["Gyms"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.3518 -0.9838 -0.8593 -0.7133 -0.7397 4.1610
summary(data2[["SwimmingPools"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.3274 -0.9554 -0.8444 -0.7227 -0.7218 4.0243
# Horizontal boxplots for the same four columns, drawn in the order listed.
# TRUE is spelled out because T is an ordinary, reassignable variable.
invisible(lapply(
  c("BeautySpas", "Cafes", "Gyms", "SwimmingPools"),
  function(category) {
    boxplot(data2[[category]], horizontal = TRUE)
  }
))

# Stacked histograms (one facet per category) of the user-standardized
# ratings for Gyms and SwimmingPools.
data2 %>%
  dplyr::select(Gyms, SwimmingPools) %>%
  pivot_longer(cols = everything()) %>%
  ggplot(aes(x = value, fill = name)) +
  geom_histogram(binwidth = 0.1, color = "grey20") +
  facet_wrap(~name, nrow = 2, ncol = 1) +
  scale_x_continuous(breaks = seq(-1.5, 4.5, by = 0.5)) +
  scale_fill_manual(values = c("lightgreen", "lightskyblue")) +
  labs(
    title = "Extreme Skewness in Average User Rating",
    x = "Average User Rating",
    y = "Count",
    caption = "*Rating of 0 indicates a rating equal to a user's average rating across all categories"
  ) +
  # The complete theme comes first so the tweaks below are not overwritten.
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
    plot.subtitle = element_text(hjust = 0.5)
  )

# Stacked histograms (one facet per category) of the user-standardized
# ratings for BeautySpas and Cafes.
data2 %>%
  dplyr::select(BeautySpas, Cafes) %>%
  pivot_longer(cols = everything()) %>%
  ggplot(aes(x = value, fill = name)) +
  geom_histogram(binwidth = 0.1, color = "grey20") +
  facet_wrap(~name, nrow = 2, ncol = 1) +
  scale_x_continuous(breaks = seq(-4, 4.5, by = 0.5)) +
  scale_fill_manual(values = c("plum", "goldenrod1")) +
  labs(
    title = "Largest Ranges in Average User Rating",
    x = "Average User Rating",
    y = "Count",
    caption = "*Rating of 0 indicates a rating equal to a user's average rating across all categories"
  ) +
  # The complete theme comes first so the tweaks below are not overwritten.
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text = element_text(size = 8),
    plot.background = element_rect(color = "white")
  )

# CORRELATION
# Corrplot with user-standardized data
# Correlation matrix of the user-standardized ratings (UserID excluded).
# NOTE: the variable `c` shadows base::c by name; calls such as c(1, 2) still
# work because R skips non-function bindings when looking up a function.
c <- cor(data2[, -1])
# Correlation matrix of the raw ratings, using only rows without missing
# values.
cc <- data[, -1] %>% drop_na() %>% cor()
corrplot(c, method = "color",
  # addCoef.col = T, number.digits = 2, number.cex = 0.5,
  type = "full",
  diag = TRUE, # spelled out: T is an ordinary, reassignable variable
  addgrid.col = "darkgrey",
  # mar = c(0,0,2,0),
  # title = "Variable Correlation Plot",
  order = "FPC",
  # hclust.method = "ward.D2",
  # NOTE(review): corrplot documents addrect as used only with
  # order = "hclust"; with order = "FPC" it presumably has no effect -- confirm.
  addrect = 6,
  tl.col = "black",
  tl.cex = 0.8,
  tl.srt = 45
)
# Title drawn by hand at fixed plot coordinates instead of corrplot's
# title argument (see the commented-out options above).
text(x = 12.5, y = 30, font = 2, col = "black", cex = 1.2,
  labels = "Variable Correlation Plot")

# making a list of correlations
# One row per unordered pair of variables, taken from the upper triangle of
# the correlation matrix `c`. Built in a single vectorized step: the previous
# version preallocated nrow(c) rows but wrote choose(ncol(c), 2) of them, so
# the tibble was regrown row by row inside a double loop, and the
# 1:(ncol(c) - 1) bound counted backwards for a 1 x 1 matrix.
pair_idx <- which(upper.tri(c), arr.ind = TRUE)
# which() walks the matrix column by column; reorder to row-major so pairs
# come out in the same order the nested loop produced them.
pair_idx <- pair_idx[
  order(pair_idx[, "row"], pair_idx[, "col"]), ,
  drop = FALSE
]
cor_df <- tibble(
  var1 = rownames(c)[pair_idx[, "row"]],
  var2 = colnames(c)[pair_idx[, "col"]],
  cor = c[pair_idx]
)
# Strongest correlations (by absolute value) first.
cor_df %>%
  # filter(cor > 0.4) %>%
  # filter(var1 == "DanceClubs" | var2 == "DanceClubs") %>%
  arrange(-abs(cor))
## # A tibble: 276 x 3
## var1 var2 cor
## <chr> <chr> <dbl>
## 1 Parks Theatres 0.573
## 2 Zoo Restaurants 0.539
## 3 Restaurants Pubs_Bars 0.536
## 4 Zoo Pubs_Bars 0.529
## 5 Malls Restaurants 0.516
## 6 Malls ViewPoints -0.501
## 7 Malls Zoo 0.496
## 8 Churches Gardens 0.492
## 9 Pubs_Bars LocalServices 0.475
## 10 Restaurants Gardens -0.442
## # ... with 266 more rows
# finding range of correlations per variable
# For every variable, collect the smallest and largest correlation it has
# with any other variable: summarise the pairs that mention it, stack the
# one-row results, and keep the result as a plain data frame.
cor_range <- rownames(c) %>%
  lapply(function(category) {
    cor_df %>%
      filter(var1 == category | var2 == category) %>%
      summarise(
        var = category,
        min = min(cor),
        max = max(cor)
      )
  }) %>%
  bind_rows() %>%
  as.data.frame()
print(cor_range)
## var min max
## 1 Churches -0.4015975 0.4921240
## 2 Resorts -0.2803196 0.3130604
## 3 Beaches -0.2890621 0.3130604
## 4 Parks -0.3340272 0.5725764
## 5 Theatres -0.3631907 0.5725764
## 6 Museums -0.2264539 0.4408501
## 7 Malls -0.5007488 0.5155590
## 8 Zoo -0.3528443 0.5386676
## 9 Restaurants -0.4421413 0.5386676
## 10 Pubs_Bars -0.3929903 0.5355326
## 11 LocalServices -0.2996600 0.4748105
## 12 Burger_PizzaShops -0.3713415 0.3910358
## 13 Hotels_OtherLodgings -0.2803196 0.4055654
## 14 JuiceBars -0.3383527 0.4055654
## 15 ArtGalleries -0.3631907 0.2723495
## 16 DanceClubs -0.1584600 0.1140728
## 17 SwimmingPools -0.2344388 0.4176647
## 18 Gyms -0.2793281 0.4176647
## 19 Bakeries -0.3349344 0.3464021
## 20 BeautySpas -0.2996600 0.1784818
## 21 Cafes -0.2974572 0.3365528
## 22 ViewPoints -0.5007488 0.4284096
## 23 Monuments -0.3733976 0.4284096
## 24 Gardens -0.4421413 0.4921240
# Variables whose correlations with every other variable all lie strictly
# between -0.25 and 0.25.
filter(cor_range, min > -0.25 & max < 0.25)
## var min max
## 1 DanceClubs -0.15846 0.1140728
# exclude DanceClubs?
# BIVARIATE
# greatest positive corr
# Scatter plots of selected pairs of user-standardized ratings.
data2 %>% ggplot(aes(x = Parks, y = Theatres)) + geom_point()

data2 %>% ggplot(aes(x = Restaurants, y = Zoo)) + geom_point()

data2 %>% ggplot(aes(x = Restaurants, y = Pubs_Bars)) + geom_point()

data2 %>% ggplot(aes(x = Zoo, y = Pubs_Bars)) + geom_point()

data2 %>% ggplot(aes(x = Malls, y = Restaurants)) + geom_point()

# greatest negative corr
data2 %>% ggplot(aes(x = Malls, y = ViewPoints)) + geom_point()

# no corr
data2 %>% ggplot(aes(x = LocalServices, y = JuiceBars)) + geom_point()

data2 %>% ggplot(aes(x = Beaches, y = Bakeries)) + geom_point()

data2 %>% ggplot(aes(x = Bakeries, y = ViewPoints)) + geom_point()

# OUTLIERS #####################################################################
# post-PCA checking for outliers
# that might be useful to remove from cluster analysis
# using data3
# PCA on the user-standardized ratings (every column except UserID), with
# variables centered and scaled by prcomp().
X <- data2[, names(data2) != "UserID", drop = FALSE]
pca <- prcomp(X, center = TRUE, scale. = TRUE)
# Attach the component scores to the ratings they were computed from.
pca_label <- cbind(data2, as.data.frame(pca$x))
# Keep the first seven components and standardize each one.
data3 <- as.data.frame(scale(dplyr::select(pca_label, PC1:PC7)))
# Long format (one row per user and component); as_factor() keeps the
# components in order of first appearance.
data_melt <- data3 %>%
  pivot_longer(cols = everything(), values_to = "coord") %>%
  mutate(name = as_factor(name))
# One boxplot per component to eyeball extreme scores.
ggplot(data_melt, aes(x = name, y = coord, fill = name)) +
  # ylim(0,5) +
  geom_boxplot()

# Scatter of the standardized PC5 and PC6 scores.
ggplot(data3, aes(x = PC5, y = PC6)) + geom_point()

# Pairwise plots of all seven standardized components.
# NOTE(review): alpha is given inside aes(), so it is a mapped aesthetic
# rather than a fixed transparency -- confirm this is intended.
GGally::ggpairs(data3, mapping = aes(alpha = 0.01))
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2

# probably nothing worth excluding