# NK Project 3 EDA

# PREP #########################################################################
# Packages
pacman::p_load(
  pacman, rio, tidyverse, magrittr, janitor,  # general stuff
  psych,        # EDA
  visdat,       # missingness
  data.table,   # working with data.tables
  corrplot,     # correlation plot
  FactoMineR,   # EDA, PCA, MFA
  factoextra,   # extract and visualize PCA/MFA
  nFactors,     # how many factors/components to retain
  cluster,      # clustering algorithms
  NbClust,      # number of clusters
  clValid       # ?
)


# Data
# Read the review sheet (header row + 5456 users, 25 columns).
# Cells containing "0" are read in as NA.
data <- import(
  "data/Travel_Review.xlsx",
  range = "A1:Y5457",
  na = "0"
)
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in L2714 / R2714C12: got '2 2.'
# give the one column whose name contains a space a syntactic name
data <- rename(data, SwimmingPools = `Swimming Pools`)
str(data)
## 'data.frame':    5456 obs. of  25 variables:
##  $ UserID              : chr  "User 1" "User 2" "User 3" "User 4" ...
##  $ Churches            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Resorts             : num  NA NA NA 0.5 NA NA 5 5 5 5 ...
##  $ Beaches             : num  3.63 3.63 3.63 3.63 3.63 3.63 3.63 3.63 3.64 3.64 ...
##  $ Parks               : num  3.65 3.65 3.63 3.63 3.63 3.63 3.63 3.63 3.64 3.64 ...
##  $ Theatres            : num  5 5 5 5 5 5 5 5 5 5 ...
##  $ Museums             : num  2.92 2.92 2.92 2.92 2.92 2.92 2.92 2.92 2.92 2.92 ...
##  $ Malls               : num  5 5 5 5 5 5 3.03 5 3.03 5 ...
##  $ Zoo                 : num  2.35 2.64 2.64 2.35 2.64 2.63 2.35 2.63 2.62 2.35 ...
##  $ Restaurants         : num  2.33 2.33 2.33 2.33 2.33 2.33 2.33 2.33 2.32 2.32 ...
##  $ Pubs_Bars           : num  2.64 2.65 2.64 2.64 2.64 2.65 2.64 2.64 2.63 2.63 ...
##  $ LocalServices       : num  1.7 1.7 1.7 1.73 1.7 1.71 1.73 1.7 1.71 1.69 ...
##  $ Burger_PizzaShops   : num  1.69 1.69 1.69 1.69 1.69 1.69 1.68 1.68 1.67 1.67 ...
##  $ Hotels_OtherLodgings: num  1.7 1.7 1.7 1.7 1.7 1.69 1.69 1.69 1.68 1.67 ...
##  $ JuiceBars           : num  1.72 1.72 1.72 1.72 1.72 1.72 1.71 1.71 1.7 1.7 ...
##  $ ArtGalleries        : num  1.74 1.74 1.74 1.74 1.74 1.74 1.75 1.74 0.75 0.74 ...
##  $ DanceClubs          : num  0.59 0.59 0.59 0.59 0.59 0.59 0.59 0.6 0.6 0.59 ...
##  $ SwimmingPools       : num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 NA NA ...
##  $ Gyms                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Bakeries            : num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 NA ...
##  $ BeautySpas          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Cafes               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ ViewPoints          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Monuments           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Gardens             : num  NA NA NA NA NA NA NA NA NA NA ...
# DATA CLEANING ################################################################

# Missings
# Missingness overview plot, with visdat's row clustering and column sorting
# switched on. TRUE is spelled out because T is an ordinary variable that can
# be reassigned.
vis_miss(data, cluster = TRUE, sort_miss = TRUE)

  # 3.9% missing total
  # Bakeries has most missing, 19.2%

# total number of missing cells, then missing cells per column (largest first)
sum(is.na(data))
## [1] 5322
colSums(is.na(data)) %>% sort(decreasing = TRUE)
##             Bakeries                 Gyms           BeautySpas 
##                 1046                 1017                  896 
##                Cafes        SwimmingPools           ViewPoints 
##                  604                  479                  345 
##            Monuments              Gardens             Churches 
##                  302                  226                  195 
##           DanceClubs              Resorts              Beaches 
##                  112                   90                    4 
##         ArtGalleries        LocalServices    Burger_PizzaShops 
##                    4                    1                    1 
##               UserID                Parks             Theatres 
##                    0                    0                    0 
##              Museums                Malls                  Zoo 
##                    0                    0                    0 
##          Restaurants            Pubs_Bars Hotels_OtherLodgings 
##                    0                    0                    0 
##            JuiceBars 
##                    0
# TRUE for every row (user) that has no missing value in any column
has_all_ratings <- complete.cases(data)
sum(has_all_ratings)   # 3724 complete cases
## [1] 3724
sum(has_all_ratings) / nrow(data)   # 68.3% complete
## [1] 0.6825513
# data %<>% drop_na()
# split into fully observed rows and rows with at least one NA
complete <- data[has_all_ratings, ]
incomplete <- data[!has_all_ratings, ]




# Standardizing Ratings by User

# # create objects to save into
# user_avgs <- data.frame(user_mean = rep(NA_real_, nrow(complete)),
#                         user_sd = rep(NA_real_, nrow(complete)))
# data2 <- complete
# 
# # find mean & sd for each user
# for (i in 1:nrow(data2)) {
#   user_avgs[i, ] <- c(rowMeans(data2[i,-1]), sd(data2[i,-1]))
# }
# 
# for (i in 1:nrow(data2)) {
#   data2[i,-1] <- data2[i,-1] %>% 
#     mutate(across(.fns = function(x) {
#       scale(x, center = user_avgs[i,1], scale = user_avgs[i,2])
#     }))
# }

# MUCH easier and faster way:
# scale() standardizes columns, so the rating columns are transposed to put
# each user in a column, standardized, and transposed back. Every row ends up
# centred and scaled by that user's own mean and sd across the categories.
data2 <- complete
data2[, -1] <- t(scale(t(data2[, -1])))
head(data2)
##     UserID   Churches    Resorts    Beaches      Parks Theatres  Museums
## 69 User 69 -0.6091162 -0.6505058 -0.6173941 -0.5925604 1.890813 2.842772
## 70 User 70 -0.7217397 -0.6777013 -0.6703616  2.3903039 1.553575 2.390304
## 71 User 71 -0.6634526 -0.6193246 -0.6119699 -0.5972605 1.616495 2.447573
## 72 User 72 -0.5867433 -0.5950317 -0.5867433 -0.5618784 1.916327 1.145514
## 73 User 73 -0.6785527 -0.6350286 -0.6277746 -0.6060126 1.555684 2.368134
## 74 User 74 -0.7170883 -0.6864435 -0.6741855  1.8448168 1.158373 1.844817
##        Malls       Zoo Restaurants Pubs_Bars LocalServices Burger_PizzaShops
## 69 1.1292449 1.1209670   1.0381879 1.0464658    0.40906672       -0.08760788
## 70 0.8783205 0.8563013   0.7902438 0.4893150    0.22508491       -0.20795889
## 71 2.4475731 0.9104469   0.8369002 0.8369002    0.27059059       -0.17068965
## 72 2.8529057 1.1206487   1.0294773 1.0294773    0.39127734       -0.11430962
## 73 0.8592986 0.8447906   0.7649964 0.7649964    0.19918319       -0.22880375
## 74 1.8448168 0.5577353   0.4841878 0.4780588    0.00612896       -0.36773757
##    Hotels_OtherLodgings  JuiceBars ArtGalleries DanceClubs SwimmingPools
## 69           -0.1786649 -0.2117765   -0.1952207 -0.1786649    -0.8408977
## 70           -0.2960356 -0.3253945   -0.3180548 -0.3033753    -0.2813561
## 71           -0.2810097 -0.2810097   -0.2736550 -0.2589457    -0.2442364
## 72           -0.2303460 -0.2386343   -0.2303460 -0.2137694    -0.8105277
## 73           -0.3158519 -0.3448680   -0.3376140 -0.3231060    -0.8453951
## 74           -0.4474140 -0.4596720   -0.4535430 -0.4412851    -0.8703123
##          Gyms   Bakeries BeautySpas      Cafes ViewPoints  Monuments    Gardens
## 69 -0.8160640 -0.8574535 -0.8822872 -0.6753395 -0.6918953 -0.7001732 -0.6918953
## 70 -0.8538547 -0.8978931 -0.9125725 -0.8978931 -0.7290794 -0.7437588 -0.7364191
## 71 -0.8031913 -0.8473193 -0.8546740 -0.8399647 -0.6708072 -0.6855166 -0.6634526
## 72 -0.8105277 -0.8519693 -0.8685459 -0.8519693 -0.6447615 -0.6613381 -0.6281849
## 73 -0.8308871 -0.8744112 -0.8816652 -0.8744112  2.3681339 -0.7003148 -0.6205206
## 74 -0.8519254 -0.8948281 -0.9009571 -0.8948281  1.8448168 -0.7354751 -0.6680566
# complete[1,]
# user_avgs[1,]
# data2[2,]
# DATA EXPLORATION - FULL DATA SET #############################################




# Keep only the numeric columns of data2. vapply() always returns a logical
# mask (sapply()'s return type depends on its input), and drop = FALSE keeps
# the result a data frame even if a single column were selected.
numvars <- data2[, vapply(data2, is.numeric, logical(1)), drop = FALSE]


# per-variable descriptive statistics, shown in order of increasing median
psy <- psych::describe(data2[, -1])
arrange(psy, median)
##                      vars    n  mean   sd median trimmed  mad   min  max range
## Gyms                   18 3724 -0.71 0.69  -0.86   -0.86 0.18 -1.35 4.16  5.51
## SwimmingPools          17 3724 -0.72 0.62  -0.84   -0.84 0.17 -1.33 4.02  5.35
## Bakeries               19 3724 -0.52 0.98  -0.84   -0.78 0.22 -1.49 4.32  5.81
## BeautySpas             20 3724 -0.51 0.97  -0.82   -0.73 0.24 -3.15 4.26  7.40
## DanceClubs             16 3724 -0.60 0.71  -0.79   -0.77 0.18 -1.51 3.33  4.84
## Cafes                  21 3724 -0.63 0.73  -0.78   -0.77 0.24 -3.72 2.56  6.28
## Monuments              23 3724 -0.17 0.94  -0.64   -0.34 0.30 -1.18 3.29  4.48
## ViewPoints             22 3724  0.03 1.14  -0.62   -0.12 0.43 -1.79 3.63  5.42
## ArtGalleries           15 3724  0.14 1.21  -0.55    0.01 0.49 -1.58 4.16  5.75
## Gardens                24 3724 -0.19 0.87  -0.52   -0.38 0.35 -1.11 3.74  4.85
## Churches                1 3724 -0.29 0.60  -0.46   -0.40 0.39 -1.10 2.95  4.05
## JuiceBars              14 3724  0.04 1.05  -0.41   -0.10 0.56 -1.54 4.00  5.54
## Hotels_OtherLodgings   13 3724  0.03 0.92  -0.28   -0.10 0.57 -1.51 3.48  4.99
## Burger_PizzaShops      12 3724 -0.08 0.76  -0.26   -0.22 0.53 -1.14 2.77  3.91
## LocalServices          11 3724  0.17 0.82  -0.12    0.07 0.61 -1.07 2.83  3.90
## Zoo                     8 3724  0.18 0.69  -0.06    0.11 0.56 -1.01 2.54  3.55
## Resorts                 2 3724  0.27 0.92  -0.05    0.17 0.72 -0.91 4.13  5.05
## Beaches                 3 3724  0.30 0.79   0.00    0.18 0.51 -0.87 4.32  5.19
## Pubs_Bars              10 3724  0.30 0.79   0.19    0.23 0.75 -1.09 2.84  3.92
## Parks                   4 3724  0.53 0.87   0.26    0.44 0.79 -0.78 4.18  4.96
## Museums                 6 3724  0.48 0.84   0.31    0.40 0.80 -0.86 2.85  3.71
## Restaurants             9 3724  0.60 0.93   0.38    0.56 1.02 -1.07 2.85  3.92
## Theatres                5 3724  0.57 0.86   0.39    0.50 0.93 -0.77 3.61  4.38
## Malls                   7 3724  0.79 0.97   0.57    0.77 1.27 -0.91 4.04  4.95
##                      skew kurtosis   se
## Gyms                 4.09    17.69 0.01
## SwimmingPools        4.50    21.41 0.01
## Bakeries             2.69     6.69 0.02
## BeautySpas           2.34     5.42 0.02
## DanceClubs           3.08     8.95 0.01
## Cafes                1.99     5.87 0.01
## Monuments            1.40     0.62 0.02
## ViewPoints           0.96    -0.62 0.02
## ArtGalleries         0.87    -0.75 0.02
## Gardens              1.85     2.59 0.01
## Churches             1.95     4.41 0.01
## JuiceBars            1.11    -0.18 0.02
## Hotels_OtherLodgings 1.15    -0.01 0.02
## Burger_PizzaShops    1.47     1.42 0.01
## LocalServices        1.04     0.08 0.01
## Zoo                  0.90     0.05 0.01
## Resorts              0.95    -0.14 0.02
## Beaches              1.32     1.32 0.01
## Pubs_Bars            0.73    -0.48 0.01
## Parks                0.82    -0.20 0.01
## Museums              0.77    -0.45 0.01
## Restaurants          0.42    -1.12 0.02
## Theatres             0.62    -0.73 0.01
## Malls                0.22    -1.18 0.02
# SwimmingPools & Gyms are VERY skewed
# same descriptive table, ordered from least to most skewed
psy %>% arrange(skew)
##                      vars    n  mean   sd median trimmed  mad   min  max range
## Malls                   7 3724  0.79 0.97   0.57    0.77 1.27 -0.91 4.04  4.95
## Restaurants             9 3724  0.60 0.93   0.38    0.56 1.02 -1.07 2.85  3.92
## Theatres                5 3724  0.57 0.86   0.39    0.50 0.93 -0.77 3.61  4.38
## Pubs_Bars              10 3724  0.30 0.79   0.19    0.23 0.75 -1.09 2.84  3.92
## Museums                 6 3724  0.48 0.84   0.31    0.40 0.80 -0.86 2.85  3.71
## Parks                   4 3724  0.53 0.87   0.26    0.44 0.79 -0.78 4.18  4.96
## ArtGalleries           15 3724  0.14 1.21  -0.55    0.01 0.49 -1.58 4.16  5.75
## Zoo                     8 3724  0.18 0.69  -0.06    0.11 0.56 -1.01 2.54  3.55
## Resorts                 2 3724  0.27 0.92  -0.05    0.17 0.72 -0.91 4.13  5.05
## ViewPoints             22 3724  0.03 1.14  -0.62   -0.12 0.43 -1.79 3.63  5.42
## LocalServices          11 3724  0.17 0.82  -0.12    0.07 0.61 -1.07 2.83  3.90
## JuiceBars              14 3724  0.04 1.05  -0.41   -0.10 0.56 -1.54 4.00  5.54
## Hotels_OtherLodgings   13 3724  0.03 0.92  -0.28   -0.10 0.57 -1.51 3.48  4.99
## Beaches                 3 3724  0.30 0.79   0.00    0.18 0.51 -0.87 4.32  5.19
## Monuments              23 3724 -0.17 0.94  -0.64   -0.34 0.30 -1.18 3.29  4.48
## Burger_PizzaShops      12 3724 -0.08 0.76  -0.26   -0.22 0.53 -1.14 2.77  3.91
## Gardens                24 3724 -0.19 0.87  -0.52   -0.38 0.35 -1.11 3.74  4.85
## Churches                1 3724 -0.29 0.60  -0.46   -0.40 0.39 -1.10 2.95  4.05
## Cafes                  21 3724 -0.63 0.73  -0.78   -0.77 0.24 -3.72 2.56  6.28
## BeautySpas             20 3724 -0.51 0.97  -0.82   -0.73 0.24 -3.15 4.26  7.40
## Bakeries               19 3724 -0.52 0.98  -0.84   -0.78 0.22 -1.49 4.32  5.81
## DanceClubs             16 3724 -0.60 0.71  -0.79   -0.77 0.18 -1.51 3.33  4.84
## Gyms                   18 3724 -0.71 0.69  -0.86   -0.86 0.18 -1.35 4.16  5.51
## SwimmingPools          17 3724 -0.72 0.62  -0.84   -0.84 0.17 -1.33 4.02  5.35
##                      skew kurtosis   se
## Malls                0.22    -1.18 0.02
## Restaurants          0.42    -1.12 0.02
## Theatres             0.62    -0.73 0.01
## Pubs_Bars            0.73    -0.48 0.01
## Museums              0.77    -0.45 0.01
## Parks                0.82    -0.20 0.01
## ArtGalleries         0.87    -0.75 0.02
## Zoo                  0.90     0.05 0.01
## Resorts              0.95    -0.14 0.02
## ViewPoints           0.96    -0.62 0.02
## LocalServices        1.04     0.08 0.01
## JuiceBars            1.11    -0.18 0.02
## Hotels_OtherLodgings 1.15    -0.01 0.02
## Beaches              1.32     1.32 0.01
## Monuments            1.40     0.62 0.02
## Burger_PizzaShops    1.47     1.42 0.01
## Gardens              1.85     2.59 0.01
## Churches             1.95     4.41 0.01
## Cafes                1.99     5.87 0.01
## BeautySpas           2.34     5.42 0.02
## Bakeries             2.69     6.69 0.02
## DanceClubs           3.08     8.95 0.01
## Gyms                 4.09    17.69 0.01
## SwimmingPools        4.50    21.41 0.01
# same descriptive table, ordered from narrowest to widest range
psy %>% arrange(range)
##                      vars    n  mean   sd median trimmed  mad   min  max range
## Zoo                     8 3724  0.18 0.69  -0.06    0.11 0.56 -1.01 2.54  3.55
## Museums                 6 3724  0.48 0.84   0.31    0.40 0.80 -0.86 2.85  3.71
## LocalServices          11 3724  0.17 0.82  -0.12    0.07 0.61 -1.07 2.83  3.90
## Burger_PizzaShops      12 3724 -0.08 0.76  -0.26   -0.22 0.53 -1.14 2.77  3.91
## Restaurants             9 3724  0.60 0.93   0.38    0.56 1.02 -1.07 2.85  3.92
## Pubs_Bars              10 3724  0.30 0.79   0.19    0.23 0.75 -1.09 2.84  3.92
## Churches                1 3724 -0.29 0.60  -0.46   -0.40 0.39 -1.10 2.95  4.05
## Theatres                5 3724  0.57 0.86   0.39    0.50 0.93 -0.77 3.61  4.38
## Monuments              23 3724 -0.17 0.94  -0.64   -0.34 0.30 -1.18 3.29  4.48
## DanceClubs             16 3724 -0.60 0.71  -0.79   -0.77 0.18 -1.51 3.33  4.84
## Gardens                24 3724 -0.19 0.87  -0.52   -0.38 0.35 -1.11 3.74  4.85
## Malls                   7 3724  0.79 0.97   0.57    0.77 1.27 -0.91 4.04  4.95
## Parks                   4 3724  0.53 0.87   0.26    0.44 0.79 -0.78 4.18  4.96
## Hotels_OtherLodgings   13 3724  0.03 0.92  -0.28   -0.10 0.57 -1.51 3.48  4.99
## Resorts                 2 3724  0.27 0.92  -0.05    0.17 0.72 -0.91 4.13  5.05
## Beaches                 3 3724  0.30 0.79   0.00    0.18 0.51 -0.87 4.32  5.19
## SwimmingPools          17 3724 -0.72 0.62  -0.84   -0.84 0.17 -1.33 4.02  5.35
## ViewPoints             22 3724  0.03 1.14  -0.62   -0.12 0.43 -1.79 3.63  5.42
## Gyms                   18 3724 -0.71 0.69  -0.86   -0.86 0.18 -1.35 4.16  5.51
## JuiceBars              14 3724  0.04 1.05  -0.41   -0.10 0.56 -1.54 4.00  5.54
## ArtGalleries           15 3724  0.14 1.21  -0.55    0.01 0.49 -1.58 4.16  5.75
## Bakeries               19 3724 -0.52 0.98  -0.84   -0.78 0.22 -1.49 4.32  5.81
## Cafes                  21 3724 -0.63 0.73  -0.78   -0.77 0.24 -3.72 2.56  6.28
## BeautySpas             20 3724 -0.51 0.97  -0.82   -0.73 0.24 -3.15 4.26  7.40
##                      skew kurtosis   se
## Zoo                  0.90     0.05 0.01
## Museums              0.77    -0.45 0.01
## LocalServices        1.04     0.08 0.01
## Burger_PizzaShops    1.47     1.42 0.01
## Restaurants          0.42    -1.12 0.02
## Pubs_Bars            0.73    -0.48 0.01
## Churches             1.95     4.41 0.01
## Theatres             0.62    -0.73 0.01
## Monuments            1.40     0.62 0.02
## DanceClubs           3.08     8.95 0.01
## Gardens              1.85     2.59 0.01
## Malls                0.22    -1.18 0.02
## Parks                0.82    -0.20 0.01
## Hotels_OtherLodgings 1.15    -0.01 0.02
## Resorts              0.95    -0.14 0.02
## Beaches              1.32     1.32 0.01
## SwimmingPools        4.50    21.41 0.01
## ViewPoints           0.96    -0.62 0.02
## Gyms                 4.09    17.69 0.01
## JuiceBars            1.11    -0.18 0.02
## ArtGalleries         0.87    -0.75 0.02
## Bakeries             2.69     6.69 0.02
## Cafes                1.99     5.87 0.01
## BeautySpas           2.34     5.42 0.02
# UNIVARIATE

# One histogram of the raw ratings per category, in column order.
# Notes from reading the plots:
#   Churches                                  - skews low
#   Theatres, Museums, Malls                  - lots of 5s
#   DanceClubs, SwimmingPools, Gyms,
#   Bakeries, BeautySpas, Cafes               - lots of 1s
raw_categories <- c(
  "Churches", "Resorts", "Beaches", "Parks", "Theatres", "Museums",
  "Malls", "Zoo", "Restaurants", "Pubs_Bars", "LocalServices",
  "Burger_PizzaShops", "Hotels_OtherLodgings", "JuiceBars", "ArtGalleries",
  "DanceClubs", "SwimmingPools", "Gyms", "Bakeries", "BeautySpas", "Cafes",
  "ViewPoints", "Monuments", "Gardens"
)

for (category in raw_categories) {
  # spell out the title and axis label hist() would derive from `data$<name>`
  col_label <- paste0("data$", category)
  hist(data[[category]],
       main = paste("Histogram of", col_label),
       xlab = col_label)
}

par(mfrow = c(4, 1))
par(mfrow = c(1, 1))

# One histogram of the user-standardized ratings per category, going from the
# least to the most skewed variable.
# Notes from reading the plots:
#   Cafes, BeautySpas     - range
#   Gyms, SwimmingPools   - skew
std_categories <- c(
  "Malls", "Restaurants", "Theatres", "Pubs_Bars", "Museums", "Parks",
  "ArtGalleries", "Zoo", "Resorts", "ViewPoints", "LocalServices",
  "JuiceBars", "Hotels_OtherLodgings", "Beaches", "Monuments",
  "Burger_PizzaShops", "Gardens", "Churches", "Cafes", "BeautySpas",
  "Bakeries", "DanceClubs", "Gyms", "SwimmingPools"
)

for (category in std_categories) {
  # spell out the title and axis label hist() would derive from `data2$<name>`
  col_label <- paste0("data2$", category)
  hist(data2[[category]],
       main = paste("Histogram of", col_label),
       xlab = col_label)
}

# Five-number summaries of the four variables that stood out above:
# BeautySpas and Cafes (widest ranges), Gyms and SwimmingPools (most skewed).
summary(data2$BeautySpas)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.1454 -0.9766 -0.8243 -0.5069 -0.6442  4.2574
summary(data2$Cafes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.7168 -0.9556 -0.7821 -0.6344 -0.6400  2.5628
summary(data2$Gyms)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.3518 -0.9838 -0.8593 -0.7133 -0.7397  4.1610
summary(data2$SwimmingPools)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.3274 -0.9554 -0.8444 -0.7227 -0.7218  4.0243
# Horizontal boxplots of the same four variables. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
boxplot(data2$BeautySpas, horizontal = TRUE)

boxplot(data2$Cafes, horizontal = TRUE)

boxplot(data2$Gyms, horizontal = TRUE)

boxplot(data2$SwimmingPools, horizontal = TRUE)

# Stack the two most skewed categories into long format and draw one
# histogram per category, one panel above the other.
data2 %>%
  dplyr::select(Gyms, SwimmingPools) %>%
  pivot_longer(cols = c(Gyms, SwimmingPools)) %>%
  ggplot(aes(x = value, fill = name)) +
  geom_histogram(binwidth = 0.1, color = "grey20") +
  facet_wrap(vars(name), nrow = 2, ncol = 1) +
  scale_x_continuous(breaks = seq(-1.5, 4.5, by = 0.5)) +
  scale_fill_manual(values = c("lightgreen", "lightskyblue")) +
  labs(
    title = "Extreme Skewness in Average User Rating",
    x = "Average User Rating",
    y = "Count",
    caption = "*Rating of 0 indicates a rating equal to a user's average rating across all categories"
  ) +
  # complete theme first, then the tweaks that override it
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
    plot.subtitle = element_text(hjust = 0.5)
  )

# Stack the two categories with the widest ranges into long format and draw
# one histogram per category, one panel above the other.
data2 %>%
  dplyr::select(BeautySpas, Cafes) %>%
  pivot_longer(cols = c(BeautySpas, Cafes)) %>%
  ggplot(aes(x = value, fill = name)) +
  geom_histogram(binwidth = 0.1, color = "grey20") +
  facet_wrap(vars(name), nrow = 2, ncol = 1) +
  scale_x_continuous(breaks = seq(-4, 4.5, by = 0.5)) +
  scale_fill_manual(values = c("plum", "goldenrod1")) +
  labs(
    title = "Largest Ranges in Average User Rating",
    x = "Average User Rating",
    y = "Count",
    caption = "*Rating of 0 indicates a rating equal to a user's average rating across all categories"
  ) +
  # complete theme first, then the tweaks that override it
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text = element_text(size = 8),
    plot.background = element_rect(color = "white")
  )

# CORRELATION

# Corrplot with user-standardized data
# `c`  : correlations between the user-standardized ratings
# `cc` : correlations between the raw ratings, complete rows only
# NOTE(review): the name `c` shadows base::c as a variable. Calls to c() still
# resolve to the function, but a more descriptive name would be safer.
c <- cor(data2[, -1])
cc <- data[, -1] %>% drop_na() %>% cor()

# Colour-tile correlation plot with variables ordered by "FPC".
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
corrplot(c, method = "color",
         # addCoef.col = T, number.digits = 2, number.cex = 0.5,
         type = "full",
         diag = TRUE,
         addgrid.col = "darkgrey",
         # mar = c(0,0,2,0),
         # title = "Variable Correlation Plot",
         order  = "FPC",
         # hclust.method = "ward.D2",
         # NOTE(review): the corrplot docs say addrect is only valid when
         # order = "hclust" - confirm whether cluster rectangles were intended
         addrect = 6,
         tl.col = "black",
         tl.cex = 0.8,
         tl.srt = 45
)
# title is drawn by hand at fixed plot coordinates
text(x = 12.5, y = 30, font = 2, col = "black", cex = 1.2,
     labels = "Variable Correlation Plot")

# making a list of correlations
# One row per unordered pair of variables (upper triangle of `c`, diagonal
# excluded). combn() enumerates the index pairs as (1,2), (1,3), ..., (2,3),
# ..., i.e. row by row through the upper triangle. All choose(p, 2) rows are
# built in one step, so the tibble has the right size from the start instead
# of being grown one row at a time inside a double loop.
pair_idx <- combn(ncol(c), 2)
cor_df <- tibble(
  var1 = rownames(c)[pair_idx[1, ]],
  var2 = colnames(c)[pair_idx[2, ]],
  cor = c[t(pair_idx)]   # two-column (row, col) index picks one cell per pair
)

# strongest relationships first, regardless of sign
cor_df %>% 
  # filter(cor > 0.4) %>%
  # filter(var1 == "DanceClubs" | var2 == "DanceClubs") %>% 
  arrange(desc(abs(cor)))
## # A tibble: 276 x 3
##    var1        var2             cor
##    <chr>       <chr>          <dbl>
##  1 Parks       Theatres       0.573
##  2 Zoo         Restaurants    0.539
##  3 Restaurants Pubs_Bars      0.536
##  4 Zoo         Pubs_Bars      0.529
##  5 Malls       Restaurants    0.516
##  6 Malls       ViewPoints    -0.501
##  7 Malls       Zoo            0.496
##  8 Churches    Gardens        0.492
##  9 Pubs_Bars   LocalServices  0.475
## 10 Restaurants Gardens       -0.442
## # ... with 266 more rows
# finding range of correlations per variable
# For every variable, the smallest and largest correlation it has with any
# OTHER variable. `c` is a symmetric correlation matrix, so row k with its own
# diagonal entry dropped holds exactly the pairs that involve variable k.
# vapply() over the rows replaces the counter-driven loop that filtered and
# summarised the pair list once per variable.
cor_range <- data.frame(
  var = rownames(c),
  min = vapply(seq_len(nrow(c)), function(k) min(c[k, -k]), numeric(1)),
  max = vapply(seq_len(nrow(c)), function(k) max(c[k, -k]), numeric(1)),
  stringsAsFactors = FALSE
)
print(cor_range)
##                     var        min       max
## 1              Churches -0.4015975 0.4921240
## 2               Resorts -0.2803196 0.3130604
## 3               Beaches -0.2890621 0.3130604
## 4                 Parks -0.3340272 0.5725764
## 5              Theatres -0.3631907 0.5725764
## 6               Museums -0.2264539 0.4408501
## 7                 Malls -0.5007488 0.5155590
## 8                   Zoo -0.3528443 0.5386676
## 9           Restaurants -0.4421413 0.5386676
## 10            Pubs_Bars -0.3929903 0.5355326
## 11        LocalServices -0.2996600 0.4748105
## 12    Burger_PizzaShops -0.3713415 0.3910358
## 13 Hotels_OtherLodgings -0.2803196 0.4055654
## 14            JuiceBars -0.3383527 0.4055654
## 15         ArtGalleries -0.3631907 0.2723495
## 16           DanceClubs -0.1584600 0.1140728
## 17        SwimmingPools -0.2344388 0.4176647
## 18                 Gyms -0.2793281 0.4176647
## 19             Bakeries -0.3349344 0.3464021
## 20           BeautySpas -0.2996600 0.1784818
## 21                Cafes -0.2974572 0.3365528
## 22           ViewPoints -0.5007488 0.4284096
## 23            Monuments -0.3733976 0.4284096
## 24              Gardens -0.4421413 0.4921240
# variables whose correlation with every other variable stays inside +/-0.25
filter(cor_range, min > -0.25 & max < 0.25)
##          var      min       max
## 1 DanceClubs -0.15846 0.1140728
# exclude DanceClubs?




# BIVARIATE

# greatest positive corr
data2 %>% ggplot(aes(x = Parks, y = Theatres)) + geom_point()

data2 %>% ggplot(aes(x = Restaurants, y = Zoo)) + geom_point()

data2 %>% ggplot(aes(x = Restaurants, y = Pubs_Bars)) + geom_point()

data2 %>% ggplot(aes(x = Zoo, y = Pubs_Bars)) + geom_point()

data2 %>% ggplot(aes(x = Malls, y = Restaurants)) + geom_point()

# greatest negative corr
data2 %>% ggplot(aes(x = Malls, y = ViewPoints)) + geom_point()

# no corr
data2 %>% ggplot(aes(x = LocalServices, y = JuiceBars)) + geom_point()

data2 %>% ggplot(aes(x = Beaches, y = Bakeries)) + geom_point()

data2 %>% ggplot(aes(x = Bakeries, y = ViewPoints)) + geom_point()

# OUTLIERS #####################################################################

# post-PCA checking for outliers
  # that might be useful to remove from cluster analysis

# using data3
# PCA on the standardized ratings (everything except the ID column), with
# prcomp() centring and scaling each variable
X <- data2[, names(data2) != "UserID", drop = FALSE]
pca <- prcomp(X, center = TRUE, scale. = TRUE)

# attach the component scores to the ratings they came from
pca_label <- cbind(data2, as.data.frame(pca$x))

# first seven component scores, each rescaled to mean 0 / sd 1
data3 <- as.data.frame(scale(dplyr::select(pca_label, PC1:PC7)))


# Long format: one row per (user, component) with the score in `coord`.
# as_factor() keeps the components in order of appearance (PC1 ... PC7).
data_melt <- mutate(
  pivot_longer(data3, cols = PC1:PC7, values_to = "coord"),
  name = as_factor(name)
)

# side-by-side boxplots of the scaled scores, one box per component
ggplot(data_melt, aes(x = name, y = coord, fill = name)) +
  # ylim(0,5) + 
  geom_boxplot()

# scatter of the fifth against the sixth scaled component
ggplot(data3, aes(x = PC5, y = PC6)) + geom_point()

# pairwise plot matrix of all seven scaled components
GGally::ggpairs(data3, mapping = aes(alpha = 0.01))
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# probably nothing worth excluding