NLB project analysis Group1

# Load the survey export (semicolon-separated, decimal commas).
mydata <- read.table("./analiza-anketa.csv", header = TRUE, sep = ";", dec = ",")

# The first data row holds the question wording rather than answers: drop it.
# NOTE(review): because of that text row the answer columns are probably
# imported as character -- later code converts them with as.numeric().
mydata <- mydata[-1, ]

# Sequential respondent ID. seq_len() also behaves correctly when no rows are
# left (seq(1, 0) would yield c(1, 0) and make the assignment fail).
mydata$ID <- seq_len(nrow(mydata))

head(mydata)

##   Q1a Q1b Q1c Q1d Q1e Q1f Q1g Q1h Q1i Q1j Q1k Q1l Q1l_text Q2 Q2_12_text Q3 Q4
## 2   0   0   0   0   0   0   0   1   0   0   0   0       -2  8         -2  1  5
## 3   1   0   0   0   0   0   0   0   0   0   0   0       -2  1         -2  3  2
## 4   1   0   0   1   0   0   0   0   0   0   0   0       -2  1         -2  1  2
## 5   1   0   0   0   0   0   0   0   0   0   0   0       -2  1         -2  2  4
## 6   0   0   0   1   0   0   0   0   0   0   1   0       -2 11         -2  2  2
## 7   0   0   1   0   0   0   0   0   0   0   0   0       -2  3         -2  1  2
##   Q5 Q6a Q6b Q6c Q6d Q7a Q7b Q7c Q7d Q7e Q7f Q7g Q7h Q7i Q7j Q7k Q7l Q8a Q8b
## 2  3   4   5   6   4   5   5   6   5   6   5   4   5   4   5   6   4   1   0
## 3  4   5   6   3   6   6   7   5   5   7   6   3   2   5   6   3   5   0   1
## 4  3   3   5   4   5   5   6   6   4   5   5   5   5   4   5   5   6   1   1
## 5  5   6   4   5   4   4   2   6   6   7   6   4   3   5   6   4   6   1   0
## 6  3   4   6   6   4   5   5   6   6   7   5   6   6   6   7   6   6   0   0
## 7  5   2   5   5   3   6   6   6   6   7   5   6   5   5   6   5   5   0   0
##   Q8c Q8d Q8e Q8f Q8g Q8h Q8i Q8j Q8k Q8k_text Q9a Q9b Q9c Q9d Q9e Q9f Q9g Q9h
## 2   1   0   0   0   0   0   0   1   0       -2   0   1   0   0   0   1   0   0
## 3   0   0   0   1   1   0   0   0   0       -2   0   0   1   0   1   0   0   0
## 4   0   0   0   1   0   0   0   0   0       -2   0   0   1   1   0   0   0   0
## 5   1   0   0   1   0   0   0   0   0       -2   0   0   0   0   1   0   1   0
## 6   0   0   0   1   0   0   1   1   0       -2   0   1   0   1   0   0   1   0
## 7   1   0   0   1   0   1   0   0   0       -2   0   0   0   0   1   0   1   0
##   Q9i Q9j Q9k Q9k_text Q10a Q10b Q10c Q10d Q10e Q10f Q11a Q11b Q11c Q11d Q11e
## 2   0   1   0       -2    5    5    5    4    5    4    1    1    1    0    0
## 3   0   1   0       -2    2    3    1    6    5    4    0    1    1    0    1
## 4   0   1   0       -2    2    2    2    2    3    2    0    1    1    1    0
## 5   0   1   0       -2    6    7    2    2    6    2    1    1    0    1    0
## 6   0   0   0       -2    2    2    5    2    3    2    0    1    1    0    0
## 7   0   1   0       -2    5    5    5    5    5    3    0    0    1    1    1
##   Q11f      Q11f_text Q13a Q13b Q13c Q13d Q13e Q13f Q13g Q13h Q13i Q14a Q14b
## 2    0             -2    5    6    4    6    6    6    5    5    6    0    0
## 3    0             -2    6    2    3    2    7    7    3    7    6    0    1
## 4    0             -2    5    5    4    4    5    6    5    6    6    0    1
## 5    0             -2    6    2    7    6    7    7    2    7    7    0    1
## 6    1 Letalske karte    5    6    2    4    6    6    5    6    6    0    0
## 7    0             -2    5    3    5    6    7    6    4    6    6    0    0
##   Q14c Q14d Q14e Q14f Q14g Q14h Q14i Q14j Q14j_text Q15a Q15b Q15c Q15d Q15e
## 2    0    1    0    1    0    1    0    0        -2    0    0    1    0    1
## 3    0    1    1    0    0    0    0    0        -2    1    1    0    0    1
## 4    1    0    1    0    0    0    0    0        -2    1    1    0    0    1
## 5    1    1    0    0    0    0    0    0        -2    0    0    0    1    1
## 6    1    0    1    0    1    0    0    0        -2    1    1    0    0    1
## 7    0    1    1    0    0    1    0    0        -2    1    0    0    0    0
##   Q15f Q15g Q15g_text Q16a Q16b Q16c Q16d Q16e Q16f Q16g Q16h Q16i Q17a Q17b
## 2    1    0        -2    5    5    5    5    5    5    5    5    6    8    8
## 3    0    0        -2    3    2    6    5    6    4    1    2    6    6   -1
## 4    0    0        -2    5    5    6    5    5    5    4    5    5    5    8
## 5    0    1 Apple Pay    4    4    6    6    6    6    4    4    6    6    8
## 6    0    0        -2    4    4    5    6    6    4    4    5    3    2    8
## 7    0    0        -2    4    4    6    6    6    5    4    4    6    8    8
##   Q17c Q17d Q17e Q17f Q18a Q18b Q18c Q18d Q18e Q18f Q19a Q19b Q19c Q19d Q19e
## 2    8    8    8    8    8    8    8    8    8    8    8    8    8    8    8
## 3   -1   -1   -1   -1    5   -1   -1   -1   -1   -1    5   -1   -1   -1   -1
## 4    8    8    6    8    5    8    8    8    6    8    3    3    4    4    3
## 5    8    8    6    8    5    8    8    8    6    8    6    8    8    6    6
## 6    6    8    7    6    3    8    5    8    6    6    4    5    6    5    6
## 7    6    8    8    8    8    8    6    8    8    8    8    8    5    8    8
##   Q19f Q20 Q21 Q22 Q22_6_text Q23 Q24 Q24_6_text Q25 Q26 Q26_4_text Q27 Q28 ID
## 2    8  23   2   1         -2   4   1         -2   1   1         -2   3   3  1
## 3   -1  20   1   5         -2   3   5         -2   1   1         -2   2   1  2
## 4    3  26   1   2         -2   5   2         -2   5   2         -2   7   3  3
## 5    8  25   2   2         -2   5   1         -2   3   2         -2   1   1  4
## 6    6  24   1   5         -2   5   1         -2   3   2         -2   4   1  5
## 7    8  25   2   4         -2   5   2         -2   4   2         -2   1   2  6

Description

# Recode the numeric survey codes into labelled factors so that R treats them
# as categorical variables. Labels carry no stray trailing spaces, so they can
# be matched exactly in later comparisons, tables and plots.
mydata$GenderF <- factor(mydata$Q21,
  levels = c(2, 1, 4),
  labels = c("F", "M", "Prefer not to say"))

# Residence is collapsed into "With parents" vs. "Away from parents": factor()
# maps codes that share a label onto a single level.
# The detailed coding that used to be assigned first (and was immediately
# overwritten) was: 1 = With parents, 2 = Studio apartment,
# 3 = Shared apartment, 4 = Owned apartment, 5 = Student dorms, 6 = Other.
# NOTE(review): code 5 (student dorms) is labelled with an empty string, which
# keeps it as a separate, unnamed level -- confirm whether it should also be
# "Away from parents".
mydata$ResidenceF <- factor(mydata$Q22,
  levels = c(1, 2, 3, 4, 5, 6),
  labels = c("With parents", "Away from parents", "Away from parents",
             "Away from parents", "", "Away from parents"))

# Bachelor degree (code 4) is listed first, which makes it the first level.
mydata$EducationF <- factor(mydata$Q23,
  levels = c(4, 1, 2, 3, 5, 6),
  labels = c("Bachelor degree", "Primary school", "Secondary school",
             "Gymnasium", "Masters degree", "Phd"))

mydata$StatusF <- factor(mydata$Q24,
  levels = c(1, 2, 3, 4, 5, 6),
  labels = c("Student, with a student job", "Employed", "Unemployed",
             "Self-employed", "Student, without a student job", "Other"))

mydata$Personal_Monthly_IncomeF <- factor(mydata$Q25,
  levels = c(1, 2, 3, 4, 5),
  labels = c("0-500", "501-1000", "1001-1500", "1501-2000", "More than 2000"))

mydata$Marital_StatusF <- factor(mydata$Q26,
  levels = c(1, 2, 3, 4),
  labels = c("In a relationship", "Single", "Married", "Other"))

mydata$RegionF <- factor(mydata$Q27,
  levels = c(1, 2, 3, 4, 5, 6, 7, 8),
  labels = c("Osrednjeslovenska", "Stajerska", "Gorenjska", "Notranjska",
             "Koroska", "Primorska", "Dolenjska", "Prekmurje"))

mydata$LiveF <- factor(mydata$Q28,
  levels = c(1, 2, 3),
  labels = c("Town", "Suburb", "The countryside"))

mydata$BankF <- factor(mydata$Q2,
  levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  labels = c("NLB", "NKBM", "SKB", "Revolut", "N26", "Unicredit", "Addiko",
             "Gorenjska banka", "Sparkasse", "Delezna banka", "Intesa Saopolo",
             "Other"))

Clustering

# Survey items considered for clustering:
# Q13e - simple and intuitive app
# Q13f - security
# Q7j  - transparency of information about terms and costs
# Q7l  - reputation of the bank
# Q10a - parents (important role) when opening a new bank account
# Q7a  - terms for a long-term loan (listed here, but not used below)
cluster_vars <- c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")

# Convert to numeric BEFORE filtering. If a column was imported as text,
# `column > 0` is a string comparison whose result depends on the locale's
# collation order, so negative codes are not reliably excluded.
mydata[cluster_vars] <- lapply(mydata[cluster_vars], as.numeric)

# Keep only respondents with a positive answer on every clustering item.
# Non-positive codes (the export contains values such as -1 and -2, presumably
# skipped/missing answers) and NAs are dropped instead of producing NA rows.
valid_answers <- !is.na(mydata[cluster_vars]) & mydata[cluster_vars] > 0
mydata <- mydata[rowSums(valid_answers) == length(cluster_vars), ]

summary(mydata[cluster_vars]) # Describing clustering variables

##       Q13e            Q13f           Q7j             Q7l             Q10a      
##  Min.   :1.000   Min.   :3.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:6.000   1st Qu.:6.00   1st Qu.:5.000   1st Qu.:5.000   1st Qu.:4.000  
##  Median :7.000   Median :7.00   Median :6.000   Median :6.000   Median :5.000  
##  Mean   :6.387   Mean   :6.38   Mean   :5.813   Mean   :5.487   Mean   :4.853  
##  3rd Qu.:7.000   3rd Qu.:7.00   3rd Qu.:7.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.00   Max.   :7.000   Max.   :7.000   Max.   :7.000

# Standardise the five clustering items (z-scores) into their own data frame
mydata_clu_std <- as.data.frame(
  scale(mydata[c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")])
)

# Outlier screening: square root of the summed squared z-scores, i.e. each
# unit's Euclidean distance from the origin of the standardised space
mydata$Dissimilarity <- with(
  mydata_clu_std,
  sqrt(Q13e^2 + Q13f^2 + Q7j^2 + Q7l^2 + Q10a^2)
)

# Units with the highest dissimilarity come first
by_dissimilarity <- order(-mydata$Dissimilarity)
head(mydata[by_dissimilarity, c("ID", "Dissimilarity")])

##      ID Dissimilarity
## 38   37      7.599202
## 97   96      5.062270
## 124 123      4.978508
## 102 101      4.441051
## 34   33      4.399388
## 35   34      4.381617

# Showing customer ID37. Select by the ID column rather than by row position:
# rows were filtered earlier, so the 37th row is not guaranteed to be ID 37.
print(mydata[mydata$ID == 37, ])

##    Q1a Q1b Q1c Q1d Q1e Q1f Q1g Q1h Q1i Q1j Q1k Q1l Q1l_text Q2 Q2_12_text Q3 Q4
## 38   1   0   0   1   0   0   0   0   0   0   0   0       -2  1         -2  2  1
##    Q5 Q6a Q6b Q6c Q6d Q7a Q7b Q7c Q7d Q7e Q7f Q7g Q7h Q7i Q7j Q7k Q7l Q8a Q8b
## 38  5   6   6   1   5   7   7   1   2   1   1   1   1   1   1   1   1   1   1
##    Q8c Q8d Q8e Q8f Q8g Q8h Q8i Q8j Q8k Q8k_text Q9a Q9b Q9c Q9d Q9e Q9f Q9g Q9h
## 38   0   0   1   0   0   0   0   0   0       -2   0   0   1   0   0   0   1   0
##    Q9i Q9j Q9k Q9k_text Q10a Q10b Q10c Q10d Q10e Q10f Q11a Q11b Q11c Q11d Q11e
## 38   0   1   0       -2    7    5    3    1    1    1    1    1    0    1    0
##    Q11f Q11f_text Q13a Q13b Q13c Q13d Q13e Q13f Q13g Q13h Q13i Q14a Q14b Q14c
## 38    0        -2    6    1    7    1    1    7    5    6    7    0    1    0
##    Q14d Q14e Q14f Q14g Q14h Q14i Q14j Q14j_text Q15a Q15b Q15c Q15d Q15e Q15f
## 38    1    1    0    0    0    0    0        -2    1    1    0    0    1    0
##    Q15g Q15g_text Q16a Q16b Q16c Q16d Q16e Q16f Q16g Q16h Q16i Q17a Q17b Q17c
## 38    0        -2    4    4    7    7    5    4    2    4    7    7    8    5
##    Q17d Q17e Q17f Q18a Q18b Q18c Q18d Q18e Q18f Q19a Q19b Q19c Q19d Q19e Q19f
## 38    8    7    8    5    8    8    8    7    8    7    8    6    8    5    8
##    Q20 Q21 Q22 Q22_6_text Q23 Q24 Q24_6_text Q25 Q26 Q26_4_text Q27 Q28 ID
## 38  -1   2   1         -2   4   5         -2   1   2         -2   1   1 37
##    GenderF   ResidenceF      EducationF                        StatusF
## 38       F With parents Bachelor degree Student, without a student job
##    Personal_Monthly_IncomeF Marital_StatusF           RegionF LiveF BankF
## 38                    0-500          Single Osrednjeslovenska  Town   NLB
##    Dissimilarity
## 38      7.599202

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Removing ID37 from the original data frame
mydata <- filter(mydata, !(ID %in% 37))

# Re-standardise the clustering items on the reduced sample
mydata_clu_std <- mydata[c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")] %>%
  scale() %>%
  as.data.frame()

#install.packages("factoextra")
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.3.2

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Euclidean distances between units, based on the 5 standardised cluster
# variables, saved into the object Distances. The method name is spelled
# "euclidean", matching the Ward step later in the analysis, instead of
# relying on the misspelling "euclidian" being tolerated.
Distances <- get_dist(mydata_clu_std,
                      method = "euclidean")

# Squared Euclidean distances
Distances2 <- Distances^2

fviz_dist(Distances2, # Showing matrix of distances
          gradient = list(low = "darkred",
                          mid = "grey95",
                          high = "white"))

library(factoextra)
get_clust_tendency(mydata_clu_std, # Hopkins statistic
                   n = nrow(mydata_clu_std) - 1,
                   graph = FALSE)

## $hopkins_stat
## [1] 0.7159374
## 
## $plot
## NULL

Euclidean distance

# Hierarchical clustering of the standardised clustering items:
# Euclidean distances between units, agglomerated with Ward's method.
# The pipe form is kept as is because hclust() records its call, which is
# echoed when WARD is printed (see "Call:" in the output below).
library(dplyr)
WARD <- mydata_clu_std %>% #Selecting variables
  get_dist(method = "euclidean") %>%  #Selecting distance
  hclust(method = "ward.D2") #Selecting algorithm         

# Print a short description of the fitted tree
WARD

## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 149

library(factoextra)
# Draw the tree to help judge how many groups to keep
fviz_dend(WARD) #Dendrogram

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Cut the Ward tree into 4 groups and store each unit's group membership
mydata$ClusterWard <- cutree(tree = WARD, k = 4)

head(mydata[, c("ID", "ClusterWard")])

##   ID ClusterWard
## 1  1           1
## 2  2           2
## 3  3           2
## 4  4           3
## 5  5           2
## 6  6           3

# Positions of the initial leaders (group means of the standardised items),
# used as starting point for k-means clustering
Leaders_initial <- aggregate(x = mydata_clu_std,
                             by = list(mydata$ClusterWard),
                             FUN = mean)

Leaders_initial

##   Group.1       Q13e       Q13f        Q7j        Q7l       Q10a
## 1       1 -1.1417980 -1.1771576 -0.3658404 -0.3762346  0.3916587
## 2       2  0.1765311  0.2048837  0.2418637  0.3145441 -1.7760976
## 3       3  0.1948608  0.1688476  0.1119981  0.4194214  0.4331117
## 4       4  0.5190073  0.6411094 -0.2764593 -1.6484563 -0.1160904

library(factoextra)

# Hierarchical k-means on the standardised clustering items
kmeans_clu <- hkmeans(
  mydata_clu_std,          # data
  k = 4,                   # number of groups
  hc.metric = "euclidean", # distance for the hierarchical step
  hc.method = "ward.D2"    # algorithm for the hierarchical step
)

kmeans_clu

## Hierarchical K-means clustering with 4 clusters of sizes 32, 23, 69, 25
## 
## Cluster means:
##          Q13e       Q13f        Q7j        Q7l       Q10a
## 1 -0.77432259 -1.3523878 -0.5968513 -0.2835846  0.4401102
## 2  0.06097423  0.2876994  0.3218533  0.3458933 -1.8572402
## 3  0.15660750  0.3414177  0.3469481  0.5396884  0.4715530
## 4  0.50279993  0.5240600 -0.4897122 -1.4447734 -0.1561663
## 
## Clustering vector:
##   [1] 1 2 2 3 2 3 3 3 2 4 3 3 3 2 1 2 3 1 3 2 3 1 1 3 4 3 2 1 4 1 3 3 1 1 1 2 1
##  [38] 2 3 2 3 3 4 3 2 4 3 4 3 3 3 3 3 3 1 3 4 3 2 3 3 3 3 3 2 1 3 3 4 3 3 2 3 3
##  [75] 2 1 3 3 1 1 1 4 4 3 1 1 3 3 3 2 4 4 3 1 1 4 2 2 1 3 1 3 1 2 4 3 4 3 3 3 1
## [112] 1 3 4 3 3 3 2 3 1 4 3 3 1 3 1 2 1 4 3 4 3 1 3 4 3 4 3 3 3 4 4 3 1 3 4 2 4
## [149] 3
## 
## Within cluster sum of squares by cluster:
## [1] 137.79060  63.18242 136.08827  93.13940
##  (between_SS / total_SS =  41.9 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
##  [6] "betweenss"    "size"         "iter"         "ifault"       "data"        
## [11] "hclust"

# Plot the 4-group solution
fviz_cluster(kmeans_clu, palette = "jama", repel = FALSE,
             ggtheme = theme_classic())

library(dplyr)
# Remove six further units by ID.
# NOTE(review): the reason is not stated in the script -- presumably units
# that looked like outliers in the cluster plot; confirm.
excluded_ids <- c(99, 94, 34, 25, 29, 121)
mydata <- filter(mydata, !(ID %in% excluded_ids))

# Re-standardise the clustering items on the reduced sample
mydata_clu_std <- mydata[c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")] %>%
  scale() %>%
  as.data.frame()

library(factoextra)

# Repeat the hierarchical k-means on the reduced sample
kmeans_clu <- hkmeans(
  mydata_clu_std,          # data
  k = 4,                   # number of groups
  hc.metric = "euclidean", # distance for the hierarchical step
  hc.method = "ward.D2"    # algorithm for the hierarchical step
)

kmeans_clu

## Hierarchical K-means clustering with 4 clusters of sizes 34, 22, 82, 5
## 
## Cluster means:
##         Q13e        Q13f        Q7j        Q7l        Q10a
## 1 -0.2779416 -1.12470153 -0.8283715 -0.5733667  0.24354354
## 2  0.2331418  0.26228217  0.3062059  0.2277391 -1.86869224
## 3  0.2848879  0.39477858  0.2441486  0.1343476  0.39575396
## 4 -3.8079822  0.01956003  0.2815832  0.6935415  0.07578474
## 
## Clustering vector:
##   [1] 1 2 2 3 2 3 3 3 2 3 3 3 3 2 1 2 3 1 3 2 3 1 1 3 3 2 1 1 3 3 1 1 2 1 4 3 2
##  [38] 3 3 3 3 2 3 3 3 3 3 3 3 3 3 1 3 3 3 2 3 3 3 3 3 2 1 4 3 1 3 3 2 3 3 2 1 3
##  [75] 3 1 1 1 3 2 3 1 1 3 3 3 2 3 3 4 1 3 2 4 3 1 3 1 2 3 4 3 3 3 3 1 1 3 1 3 3
## [112] 3 2 1 1 3 3 1 3 1 2 1 3 3 1 3 1 3 3 3 3 3 3 3 3 3 3 1 3 1 2 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 152.05724  51.98429 201.85358  25.22622
##  (between_SS / total_SS =  39.3 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
##  [6] "betweenss"    "size"         "iter"         "ifault"       "data"        
## [11] "hclust"

# Plot the solution obtained on the reduced sample
fviz_cluster(kmeans_clu, palette = "jama", repel = FALSE,
             ggtheme = theme_classic())

# Cluster centres (means of the standardised items), shown with 3 decimals
Centroids <- kmeans_clu[["centers"]]
round(Centroids, digits = 3)

##     Q13e   Q13f    Q7j    Q7l   Q10a
## 1 -0.278 -1.125 -0.828 -0.573  0.244
## 2  0.233  0.262  0.306  0.228 -1.869
## 3  0.285  0.395  0.244  0.134  0.396
## 4 -3.808  0.020  0.282  0.694  0.076

library(ggplot2)
library(tidyr)

# Long-format copy of the centroids: one row per (cluster, variable) pair
Picture <- as.data.frame(Centroids)
Picture$id <- seq_len(nrow(Picture))
Picture <- pivot_longer(
  Picture,
  cols = c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")
)

# Cluster number as a factor, used for colour, shape and line type
Picture$Group <- factor(Picture$id, levels = 1:4, labels = as.character(1:4))

# Fix the order of the variables along the x axis
Picture$nameFactor <- factor(
  Picture$name,
  levels = c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")
)

# Profile plot: one line per cluster across the clustering variables
ggplot(Picture, aes(x = nameFactor, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Group, colour = Group), size = 3) +
  geom_line(aes(group = id, linetype = Group, colour = Group), linewidth = 1) +
  labs(x = "Cluster variables", y = "Averages")

Using Spearman correlation coefficient

# Reload the survey export (semicolon-separated, decimal commas) so that this
# section starts again from the full, unfiltered data.
mydata <- read.table("./analiza-anketa.csv", header = TRUE, sep = ";", dec = ",")

# The first data row holds the question wording rather than answers: drop it.
# NOTE(review): because of that text row the answer columns are probably
# imported as character -- later code converts them with as.numeric().
mydata <- mydata[-1, ]

# Sequential respondent ID. seq_len() also behaves correctly when no rows are
# left (seq(1, 0) would yield c(1, 0) and make the assignment fail).
mydata$ID <- seq_len(nrow(mydata))

head(mydata)

##   Q1a Q1b Q1c Q1d Q1e Q1f Q1g Q1h Q1i Q1j Q1k Q1l Q1l_text Q2 Q2_12_text Q3 Q4
## 2   0   0   0   0   0   0   0   1   0   0   0   0       -2  8         -2  1  5
## 3   1   0   0   0   0   0   0   0   0   0   0   0       -2  1         -2  3  2
## 4   1   0   0   1   0   0   0   0   0   0   0   0       -2  1         -2  1  2
## 5   1   0   0   0   0   0   0   0   0   0   0   0       -2  1         -2  2  4
## 6   0   0   0   1   0   0   0   0   0   0   1   0       -2 11         -2  2  2
## 7   0   0   1   0   0   0   0   0   0   0   0   0       -2  3         -2  1  2
##   Q5 Q6a Q6b Q6c Q6d Q7a Q7b Q7c Q7d Q7e Q7f Q7g Q7h Q7i Q7j Q7k Q7l Q8a Q8b
## 2  3   4   5   6   4   5   5   6   5   6   5   4   5   4   5   6   4   1   0
## 3  4   5   6   3   6   6   7   5   5   7   6   3   2   5   6   3   5   0   1
## 4  3   3   5   4   5   5   6   6   4   5   5   5   5   4   5   5   6   1   1
## 5  5   6   4   5   4   4   2   6   6   7   6   4   3   5   6   4   6   1   0
## 6  3   4   6   6   4   5   5   6   6   7   5   6   6   6   7   6   6   0   0
## 7  5   2   5   5   3   6   6   6   6   7   5   6   5   5   6   5   5   0   0
##   Q8c Q8d Q8e Q8f Q8g Q8h Q8i Q8j Q8k Q8k_text Q9a Q9b Q9c Q9d Q9e Q9f Q9g Q9h
## 2   1   0   0   0   0   0   0   1   0       -2   0   1   0   0   0   1   0   0
## 3   0   0   0   1   1   0   0   0   0       -2   0   0   1   0   1   0   0   0
## 4   0   0   0   1   0   0   0   0   0       -2   0   0   1   1   0   0   0   0
## 5   1   0   0   1   0   0   0   0   0       -2   0   0   0   0   1   0   1   0
## 6   0   0   0   1   0   0   1   1   0       -2   0   1   0   1   0   0   1   0
## 7   1   0   0   1   0   1   0   0   0       -2   0   0   0   0   1   0   1   0
##   Q9i Q9j Q9k Q9k_text Q10a Q10b Q10c Q10d Q10e Q10f Q11a Q11b Q11c Q11d Q11e
## 2   0   1   0       -2    5    5    5    4    5    4    1    1    1    0    0
## 3   0   1   0       -2    2    3    1    6    5    4    0    1    1    0    1
## 4   0   1   0       -2    2    2    2    2    3    2    0    1    1    1    0
## 5   0   1   0       -2    6    7    2    2    6    2    1    1    0    1    0
## 6   0   0   0       -2    2    2    5    2    3    2    0    1    1    0    0
## 7   0   1   0       -2    5    5    5    5    5    3    0    0    1    1    1
##   Q11f      Q11f_text Q13a Q13b Q13c Q13d Q13e Q13f Q13g Q13h Q13i Q14a Q14b
## 2    0             -2    5    6    4    6    6    6    5    5    6    0    0
## 3    0             -2    6    2    3    2    7    7    3    7    6    0    1
## 4    0             -2    5    5    4    4    5    6    5    6    6    0    1
## 5    0             -2    6    2    7    6    7    7    2    7    7    0    1
## 6    1 Letalske karte    5    6    2    4    6    6    5    6    6    0    0
## 7    0             -2    5    3    5    6    7    6    4    6    6    0    0
##   Q14c Q14d Q14e Q14f Q14g Q14h Q14i Q14j Q14j_text Q15a Q15b Q15c Q15d Q15e
## 2    0    1    0    1    0    1    0    0        -2    0    0    1    0    1
## 3    0    1    1    0    0    0    0    0        -2    1    1    0    0    1
## 4    1    0    1    0    0    0    0    0        -2    1    1    0    0    1
## 5    1    1    0    0    0    0    0    0        -2    0    0    0    1    1
## 6    1    0    1    0    1    0    0    0        -2    1    1    0    0    1
## 7    0    1    1    0    0    1    0    0        -2    1    0    0    0    0
##   Q15f Q15g Q15g_text Q16a Q16b Q16c Q16d Q16e Q16f Q16g Q16h Q16i Q17a Q17b
## 2    1    0        -2    5    5    5    5    5    5    5    5    6    8    8
## 3    0    0        -2    3    2    6    5    6    4    1    2    6    6   -1
## 4    0    0        -2    5    5    6    5    5    5    4    5    5    5    8
## 5    0    1 Apple Pay    4    4    6    6    6    6    4    4    6    6    8
## 6    0    0        -2    4    4    5    6    6    4    4    5    3    2    8
## 7    0    0        -2    4    4    6    6    6    5    4    4    6    8    8
##   Q17c Q17d Q17e Q17f Q18a Q18b Q18c Q18d Q18e Q18f Q19a Q19b Q19c Q19d Q19e
## 2    8    8    8    8    8    8    8    8    8    8    8    8    8    8    8
## 3   -1   -1   -1   -1    5   -1   -1   -1   -1   -1    5   -1   -1   -1   -1
## 4    8    8    6    8    5    8    8    8    6    8    3    3    4    4    3
## 5    8    8    6    8    5    8    8    8    6    8    6    8    8    6    6
## 6    6    8    7    6    3    8    5    8    6    6    4    5    6    5    6
## 7    6    8    8    8    8    8    6    8    8    8    8    8    5    8    8
##   Q19f Q20 Q21 Q22 Q22_6_text Q23 Q24 Q24_6_text Q25 Q26 Q26_4_text Q27 Q28 ID
## 2    8  23   2   1         -2   4   1         -2   1   1         -2   3   3  1
## 3   -1  20   1   5         -2   3   5         -2   1   1         -2   2   1  2
## 4    3  26   1   2         -2   5   2         -2   5   2         -2   7   3  3
## 5    8  25   2   2         -2   5   1         -2   3   2         -2   1   1  4
## 6    6  24   1   5         -2   5   1         -2   3   2         -2   4   1  5
## 7    8  25   2   4         -2   5   2         -2   4   2         -2   1   2  6

# Clustering items used in this section
cluster_vars <- c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")

# Convert to numeric BEFORE filtering. If a column was imported as text,
# `column > 0` is a string comparison whose result depends on the locale's
# collation order, so negative codes are not reliably excluded.
mydata[cluster_vars] <- lapply(mydata[cluster_vars], as.numeric)

# Keep only respondents with a positive answer on every clustering item;
# non-positive codes and NAs are dropped instead of producing NA rows.
valid_answers <- !is.na(mydata[cluster_vars]) & mydata[cluster_vars] > 0
mydata <- mydata[rowSums(valid_answers) == length(cluster_vars), ]

library(dplyr)
mydata <- mydata %>%
  filter(!ID %in% c(37)) # Removing ID37 from original data frame

# Standardise the clustering items
mydata_clu_std <- as.data.frame(scale(mydata[cluster_vars]))

# Hierarchical clustering on Spearman-correlation-based distances between
# units. The pipe form is kept because hclust() records and prints its call.
WARD <- mydata_clu_std %>% # Selecting variables
  get_dist(method = "spearman") %>% # Selecting distance
  hclust(method = "ward.D") # Selecting algorithm

WARD

## 
## Call:
## hclust(d = ., method = "ward.D")
## 
## Cluster method   : ward.D 
## Number of objects: 149

library(factoextra)
fviz_dend(WARD) # Dendrogram

# Cut the tree into 3 groups and store each unit's group membership
mydata$ClusterWard <- cutree(tree = WARD, k = 3)

head(mydata[, c("ID", "ClusterWard")])

##   ID ClusterWard
## 1  1           1
## 2  2           1
## 3  3           2
## 4  4           1
## 5  5           2
## 6  6           3

# Positions of the initial leaders (group means of the standardised items),
# used as starting point for k-means clustering
Leaders_initial <- aggregate(x = mydata_clu_std,
                             by = list(mydata$ClusterWard),
                             FUN = mean)

Leaders_initial

##   Group.1       Q13e        Q13f         Q7j          Q7l       Q10a
## 1       1  0.2681796  0.58583604 -0.29924274 -0.537584391 -0.1710966
## 2       2 -0.0358253  0.04793232  0.45038760  0.782974895 -0.5789471
## 3       3 -0.3046238 -0.77323723 -0.01065265  0.003189433  0.7083880

# In this section the group means themselves serve as the centroids
Centroids <- Leaders_initial
round(Centroids, digits = 3)

##   Group.1   Q13e   Q13f    Q7j    Q7l   Q10a
## 1       1  0.268  0.586 -0.299 -0.538 -0.171
## 2       2 -0.036  0.048  0.450  0.783 -0.579
## 3       3 -0.305 -0.773 -0.011  0.003  0.708

library(ggplot2)
library(tidyr)

# Long-format copy of the centroids: one row per (cluster, variable) pair
Picture <- as.data.frame(Centroids)
Picture$id <- seq_len(nrow(Picture))
Picture <- pivot_longer(
  Picture,
  cols = c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")
)

# Cluster number as a factor, used for colour, shape and line type
Picture$Group <- factor(Picture$id, levels = 1:3, labels = as.character(1:3))

# Fix the order of the variables along the x axis
Picture$nameFactor <- factor(
  Picture$name,
  levels = c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")
)

# Profile plot: one line per cluster across the clustering variables
ggplot(Picture, aes(x = nameFactor, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Group, colour = Group), size = 3) +
  geom_line(aes(group = id, linetype = Group, colour = Group), linewidth = 1) +
  labs(x = "Cluster variables", y = "Averages")

# Number of units in each group
table(mydata$ClusterWard)

## 
##  1  2  3 
## 60 41 48

# Checking if cluster variables are okay (if they differentiate):
# one-way ANOVA of each clustering item on group membership. The cbind() on
# the left-hand side fits all five responses in a single call, and summary()
# then reports a separate ANOVA table per response.
fit <- aov(cbind(Q13e, Q13f, Q7j, Q7l, Q10a) ~ as.factor(ClusterWard), 
           data = mydata)

summary(fit)

##  Response Q13e :
##                         Df  Sum Sq Mean Sq F value  Pr(>F)  
## as.factor(ClusterWard)   2   7.294  3.6469  4.6272 0.01126 *
## Residuals              146 115.069  0.7881                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q13f :
##                         Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterWard)   2 32.352 16.1759  36.558 1.345e-13 ***
## Residuals              146 64.601  0.4425                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q7j :
##                         Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterWard)   2  18.271  9.1354  7.4438 0.0008349 ***
## Residuals              146 179.179  1.2273                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q7l :
##                         Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterWard)   2  68.652  34.326  29.384 1.887e-11 ***
## Residuals              146 170.556   1.168                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q10a :
##                         Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterWard)   2 126.28  63.141  26.655 1.356e-10 ***
## Residuals              146 345.85   2.369                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Q20 (analysed under the "Age" heading below): convert to numeric BEFORE
# filtering. On a text column `> 0` is a locale-dependent string comparison,
# and NA conditions would insert all-NA rows into the data frame.
mydata$Q20 <- as.numeric(mydata$Q20)
mydata <- mydata[!is.na(mydata$Q20) & mydata$Q20 > 0, ]

# Mean of Q20 per group
aggregate(mydata$Q20,
          by = list(mydata$ClusterWard),
          FUN = mean)

##   Group.1        x
## 1       1 22.45763
## 2       2 22.73171
## 3       3 21.86957

# Q21 (gender code): same convert-then-filter order as for Q20.
mydata$Q21 <- as.numeric(mydata$Q21)
mydata <- mydata[!is.na(mydata$Q21) & mydata$Q21 > 0, ]

# NOTE(review): Q21 is a categorical code (it is turned into the factor
# GenderF elsewhere in this script), so its mean is only a rough indicator of
# the gender mix per group; a cross-tabulation would be easier to interpret.
aggregate(mydata$Q21,
          by = list(mydata$ClusterWard),
          FUN = mean)

##   Group.1        x
## 1       1 1.689655
## 2       2 1.512195
## 3       3 1.608696

Age

# One-way ANOVA: does the mean of Q20 differ between the groups?
# ClusterWard is wrapped in as.factor() so it is treated as a grouping
# variable rather than as a numeric predictor.
fit <- aov(Q20 ~ as.factor(ClusterWard), 
           data = mydata)

# Print the ANOVA table for the fitted model
summary.aov(fit)

##                         Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterWard)   2   17.6   8.801   1.713  0.184
## Residuals              142  729.7   5.139

# Recode the survey answer codes into labelled factors so that R treats them
# as categorical rather than numeric variables.
# Gender (Q21): codes 2, 1 and 4 receive labels; any other code becomes NA.
mydata$GenderF <- factor(
  mydata$Q21,
  levels = c(2, 1, 4),
  labels = c("F", "M", "Prefer not to say")
)

# Residence (Q22), collapsed to three categories.
# Detailed answer codes: 1 = With parents, 2 = Studio apartment,
# 3 = Shared apartment, 4 = Owned apartment, 5 = Student dorms, 6 = Other.
# Codes 2, 3, 4 and 6 share the label "Away from parents"; factor() merges
# levels with identical labels. The previous version had a trailing space in
# one of these labels (splitting the category into two table rows) and an empty
# label for code 5; it also first built a detailed factor that was immediately
# overwritten. Re-knit the report so the Residence tables reflect this recode.
mydata$ResidenceF <- factor(
  mydata$Q22,
  levels = c(1, 2, 3, 4, 5, 6),
  labels = c("With parents", "Away from parents", "Away from parents",
             "Away from parents", "Student dorms", "Away from parents")
)

# Labelled factors for the remaining demographic questions. Each answer code in
# `levels` is paired positionally with the text in `labels`.
# NOTE(review): several labels carry a trailing space (e.g. "Secondary school ",
# "Unemployed ", "Gorenjska ", "SKB "); they are kept unchanged here so the
# printed tables stay the same.

# Education (Q23): code 4 is listed first, so it becomes the first level.
mydata$EducationF <- factor(
  mydata$Q23,
  levels = c(4, 1, 2, 3, 5, 6),
  labels = c("Bachelor degree", "Primary school", "Secondary school ",
             "Gymnasium", "Masters degree", "Phd")
)

# Employment status (Q24).
mydata$StatusF <- factor(
  mydata$Q24,
  levels = 1:6,
  labels = c("Student, with a student job", "Employed", "Unemployed ",
             "Self-employed", "Student, without a student job", "Other")
)

# Personal monthly income bracket (Q25).
mydata$Personal_Monthly_IncomeF <- factor(
  mydata$Q25,
  levels = 1:5,
  labels = c("0-500", "501-1000", "1001-1500", "1501-2000", "More than 2000")
)

# Marital status (Q26).
mydata$Marital_StatusF <- factor(
  mydata$Q26,
  levels = 1:4,
  labels = c("In a relationship", "Single", "Married", "Other")
)

# Region (Q27).
mydata$RegionF <- factor(
  mydata$Q27,
  levels = 1:8,
  labels = c("Osrednjeslovenska", "Stajerska", "Gorenjska ", "Notranjska",
             "Koroska", "Primorska", "Dolenjska", "Prekmurje")
)

# Type of settlement (Q28).
mydata$LiveF <- factor(
  mydata$Q28,
  levels = 1:3,
  labels = c("Town", "Suburb", "The countryside")
)

# Bank (Q2).
mydata$BankF <- factor(
  mydata$Q2,
  levels = 1:12,
  labels = c("NLB", "NKBM", "SKB ", "Revolut", "N26", "Unicredit", "Addiko",
             "Gorenjska banka", "Sparkasse", "Delezna banka", "Intesa Saopolo",
             "Other")
)

Gender

# Chi-square test of independence: GenderF vs. Ward cluster membership.
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect (small expected counts); consider simulate.p.value = TRUE or
# fisher.test() before interpreting the p-value.
chi_square <- chisq.test(mydata$GenderF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$GenderF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$GenderF and as.factor(mydata$ClusterWard)
## X-squared = 8.1946, df = 4, p-value = 0.0847

# Pearson residuals per cell; `residuals` is spelled out instead of relying on
# partial matching of `$res`.
round(chi_square$residuals, 2)

##                    as.factor(mydata$ClusterWard)
## mydata$GenderF          1     2     3
##   F                  0.95 -1.28  0.14
##   M                 -1.08  1.38 -0.09
##   Prefer not to say -0.63  1.35 -0.56

round (chi_square$observed, 2)

##                    as.factor(mydata$ClusterWard)
## mydata$GenderF       1  2  3
##   F                 40 18 28
##   M                 18 22 18
##   Prefer not to say  0  1  0

# Joint proportions; margins are added before rounding so the Sum entries are
# rounded totals, not sums of already-rounded cells.
round(addmargins(prop.table(chi_square$observed)), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$GenderF          1     2     3   Sum
##   F                 0.276 0.124 0.193 0.593
##   M                 0.124 0.152 0.124 0.400
##   Prefer not to say 0.000 0.007 0.000 0.007
##   Sum               0.400 0.283 0.317 1.000

# Row proportions (within each category); the Sum column is computed before
# rounding so it shows the true row total.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$GenderF          1     2     3   Sum
##   F                 0.465 0.209 0.326 1.000
##   M                 0.310 0.379 0.310 0.999
##   Prefer not to say 0.000 1.000 0.000 1.000

# Column proportions (within each cluster); the Sum row is computed before
# rounding so it shows the true column total.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$GenderF          1     2     3
##   F                 0.690 0.439 0.609
##   M                 0.310 0.537 0.391
##   Prefer not to say 0.000 0.024 0.000
##   Sum               1.000 1.000 1.000

Residence

# Chi-square test of independence: ResidenceF vs. Ward cluster membership.
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect (small expected counts); consider simulate.p.value = TRUE or
# fisher.test() before interpreting the p-value.
chi_square <- chisq.test(mydata$ResidenceF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$ResidenceF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$ResidenceF and as.factor(mydata$ClusterWard)
## X-squared = 8.2105, df = 6, p-value = 0.2231

# Pearson residuals per cell; `residuals` is spelled out instead of relying on
# partial matching of `$res`.
round(chi_square$residuals, 2)

##                     as.factor(mydata$ClusterWard)
## mydata$ResidenceF        1     2     3
##   With parents       -0.48 -0.42  0.93
##   Away from parents  -0.31  1.00 -0.60
##   Away from parents   0.00  0.99 -0.93
##                       1.23 -1.45 -0.01

round (chi_square$observed, 2)

##                     as.factor(mydata$ClusterWard)
## mydata$ResidenceF     1  2  3
##   With parents       27 19 28
##   Away from parents   6  7  4
##   Away from parents  14 13  8
##                      11  2  6

# Joint proportions; margins are added before rounding so the Sum entries are
# rounded totals, not sums of already-rounded cells.
round(addmargins(prop.table(chi_square$observed)), 3)

##                     as.factor(mydata$ClusterWard)
## mydata$ResidenceF        1     2     3   Sum
##   With parents       0.186 0.131 0.193 0.510
##   Away from parents  0.041 0.048 0.028 0.117
##   Away from parents  0.097 0.090 0.055 0.242
##                      0.076 0.014 0.041 0.131
##   Sum                0.400 0.283 0.317 1.000

# Row proportions (within each category); the Sum column is computed before
# rounding so it shows the true row total.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                     as.factor(mydata$ClusterWard)
## mydata$ResidenceF        1     2     3   Sum
##   With parents       0.365 0.257 0.378 1.000
##   Away from parents  0.353 0.412 0.235 1.000
##   Away from parents  0.400 0.371 0.229 1.000
##                      0.579 0.105 0.316 1.000

# Column proportions (within each cluster); the Sum row is computed before
# rounding so it shows the true column total.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                     as.factor(mydata$ClusterWard)
## mydata$ResidenceF        1     2     3
##   With parents       0.466 0.463 0.609
##   Away from parents  0.103 0.171 0.087
##   Away from parents  0.241 0.317 0.174
##                      0.190 0.049 0.130
##   Sum                1.000 1.000 1.000

Education

# Chi-square test of independence: EducationF vs. Ward cluster membership.
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect (small expected counts); consider simulate.p.value = TRUE or
# fisher.test() before interpreting the p-value.
chi_square <- chisq.test(mydata$EducationF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$EducationF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EducationF and as.factor(mydata$ClusterWard)
## X-squared = 7.2304, df = 6, p-value = 0.3001

# Pearson residuals per cell; `residuals` is spelled out instead of relying on
# partial matching of `$res`.
round(chi_square$residuals, 2)

##                    as.factor(mydata$ClusterWard)
## mydata$EducationF       1     2     3
##   Bachelor degree    0.31 -0.38  0.01
##   Secondary school  -1.63  1.83  0.11
##   Gymnasium          0.64 -0.70 -0.06
##   Masters degree    -0.16  0.22 -0.03

round (chi_square$observed, 2)

##                    as.factor(mydata$ClusterWard)
## mydata$EducationF    1  2  3
##   Bachelor degree   28 17 21
##   Secondary school   2  8  5
##   Gymnasium         22 11 15
##   Masters degree     6  5  5

# Joint proportions; margins are added before rounding so the Sum entries are
# rounded totals, not sums of already-rounded cells (previously 0.998 overall).
round(addmargins(prop.table(chi_square$observed)), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$EducationF       1     2     3   Sum
##   Bachelor degree   0.193 0.117 0.145 0.455
##   Secondary school  0.014 0.055 0.034 0.103
##   Gymnasium         0.152 0.076 0.103 0.331
##   Masters degree    0.041 0.034 0.034 0.109
##   Sum               0.400 0.282 0.316 0.998

# Row proportions (within each category); the Sum column is computed before
# rounding so it shows the true row total.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$EducationF       1     2     3   Sum
##   Bachelor degree   0.424 0.258 0.318 1.000
##   Secondary school  0.133 0.533 0.333 0.999
##   Gymnasium         0.458 0.229 0.312 0.999
##   Masters degree    0.375 0.312 0.312 0.999

# Column proportions (within each cluster); the Sum row is computed before
# rounding so it shows the true column total.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$EducationF       1     2     3
##   Bachelor degree   0.483 0.415 0.457
##   Secondary school  0.034 0.195 0.109
##   Gymnasium         0.379 0.268 0.326
##   Masters degree    0.103 0.122 0.109
##   Sum               0.999 1.000 1.001

Status

# Chi-square test of independence: StatusF vs. Ward cluster membership.
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect (small expected counts); consider simulate.p.value = TRUE or
# fisher.test() before interpreting the p-value.
chi_square <- chisq.test(mydata$StatusF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$StatusF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$StatusF and as.factor(mydata$ClusterWard)
## X-squared = 9.421, df = 10, p-value = 0.4927

# Pearson residuals per cell; `residuals` is spelled out instead of relying on
# partial matching of `$res`.
round(chi_square$residuals, 2)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3
##   Student, with a student job     0.04 -0.08  0.03
##   Employed                       -0.32  1.48 -1.04
##   Unemployed                      0.73 -0.92  0.05
##   Self-employed                   1.11 -0.12 -1.13
##   Student, without a student job -0.31 -0.84  1.14
##   Other                          -0.18  0.16  0.05

round (chi_square$observed, 2)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                    1  2  3
##   Student, with a student job    29 20 23
##   Employed                        9 11  5
##   Unemployed                      2  0  1
##   Self-employed                   3  1  0
##   Student, without a student job 14  8 16
##   Other                           1  1  1

# Joint proportions; margins are added before rounding so the Sum entries are
# rounded totals, not sums of already-rounded cells (previously 1.001 overall).
round(addmargins(prop.table(chi_square$observed)), 3)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3   Sum
##   Student, with a student job    0.200 0.138 0.159 0.497
##   Employed                       0.062 0.076 0.034 0.172
##   Unemployed                     0.014 0.000 0.007 0.021
##   Self-employed                  0.021 0.007 0.000 0.028
##   Student, without a student job 0.097 0.055 0.110 0.262
##   Other                          0.007 0.007 0.007 0.021
##   Sum                            0.401 0.283 0.317 1.001

# Row proportions (within each category); the Sum column is computed before
# rounding so it shows the true row total.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3   Sum
##   Student, with a student job    0.403 0.278 0.319 1.000
##   Employed                       0.360 0.440 0.200 1.000
##   Unemployed                     0.667 0.000 0.333 1.000
##   Self-employed                  0.750 0.250 0.000 1.000
##   Student, without a student job 0.368 0.211 0.421 1.000
##   Other                          0.333 0.333 0.333 0.999

# Column proportions (within each cluster); the Sum row is computed before
# rounding so it shows the true column total.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3
##   Student, with a student job    0.500 0.488 0.500
##   Employed                       0.155 0.268 0.109
##   Unemployed                     0.034 0.000 0.022
##   Self-employed                  0.052 0.024 0.000
##   Student, without a student job 0.241 0.195 0.348
##   Other                          0.017 0.024 0.022
##   Sum                            0.999 0.999 1.001

Personal monthly income

# Chi-square test of independence: Personal_Monthly_IncomeF vs. Ward cluster.
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect (small expected counts); consider simulate.p.value = TRUE or
# fisher.test() before interpreting the p-value.
chi_square <- chisq.test(mydata$Personal_Monthly_IncomeF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$Personal_Monthly_IncomeF,
## as.factor(mydata$ClusterWard)): Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Personal_Monthly_IncomeF and as.factor(mydata$ClusterWard)
## X-squared = 14.564, df = 8, p-value = 0.0682

# Pearson residuals per cell; `residuals` is spelled out instead of relying on
# partial matching of `$res`.
round(chi_square$residuals, 2)

##                 as.factor(mydata$ClusterWard)
##                      1     2     3
##   0-500           0.08 -1.70  1.52
##   501-1000        0.34  0.65 -0.99
##   1001-1500      -0.55  1.16 -0.48
##   1501-2000       0.67 -0.04 -0.72
##   More than 2000 -1.00  1.89 -0.66

round (chi_square$observed, 2)

##                 as.factor(mydata$ClusterWard)
##                   1  2  3
##   0-500          28 12 29
##   501-1000       14 11  7
##   1001-1500       5  7  4
##   1501-2000       9  5  4
##   More than 2000  2  6  2

# Joint proportions; margins are added before rounding so the Sum entries are
# rounded totals, not sums of already-rounded cells.
round(addmargins(prop.table(chi_square$observed)), 3)

##                 as.factor(mydata$ClusterWard)
##                      1     2     3   Sum
##   0-500          0.193 0.083 0.200 0.476
##   501-1000       0.097 0.076 0.048 0.221
##   1001-1500      0.034 0.048 0.028 0.110
##   1501-2000      0.062 0.034 0.028 0.124
##   More than 2000 0.014 0.041 0.014 0.069
##   Sum            0.400 0.282 0.318 1.000

# Row proportions (within each category); the Sum column is computed before
# rounding so it shows the true row total.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                 as.factor(mydata$ClusterWard)
##                      1     2     3   Sum
##   0-500          0.406 0.174 0.420 1.000
##   501-1000       0.438 0.344 0.219 1.001
##   1001-1500      0.312 0.438 0.250 1.000
##   1501-2000      0.500 0.278 0.222 1.000
##   More than 2000 0.200 0.600 0.200 1.000

# Column proportions (within each cluster); the Sum row is computed before
# rounding so it shows the true column total.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                 as.factor(mydata$ClusterWard)
##                      1     2     3
##   0-500          0.483 0.293 0.630
##   501-1000       0.241 0.268 0.152
##   1001-1500      0.086 0.171 0.087
##   1501-2000      0.155 0.122 0.087
##   More than 2000 0.034 0.146 0.043
##   Sum            0.999 1.000 0.999

Status

# Chi-square test of independence: StatusF vs. Ward cluster membership.
# NOTE(review): this repeats the StatusF test already run earlier in this
# report (same call, same output); one of the two sections can likely be
# dropped. R also warns below that the chi-squared approximation may be
# incorrect (small expected counts).
chi_square <- chisq.test(mydata$StatusF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$StatusF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$StatusF and as.factor(mydata$ClusterWard)
## X-squared = 9.421, df = 10, p-value = 0.4927

# Pearson residuals per cell; `residuals` is spelled out instead of relying on
# partial matching of `$res`.
round(chi_square$residuals, 2)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3
##   Student, with a student job     0.04 -0.08  0.03
##   Employed                       -0.32  1.48 -1.04
##   Unemployed                      0.73 -0.92  0.05
##   Self-employed                   1.11 -0.12 -1.13
##   Student, without a student job -0.31 -0.84  1.14
##   Other                          -0.18  0.16  0.05

round (chi_square$observed, 2)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                    1  2  3
##   Student, with a student job    29 20 23
##   Employed                        9 11  5
##   Unemployed                      2  0  1
##   Self-employed                   3  1  0
##   Student, without a student job 14  8 16
##   Other                           1  1  1

# Joint proportions; margins are added before rounding so the Sum entries are
# rounded totals, not sums of already-rounded cells (previously 1.001 overall).
round(addmargins(prop.table(chi_square$observed)), 3)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3   Sum
##   Student, with a student job    0.200 0.138 0.159 0.497
##   Employed                       0.062 0.076 0.034 0.172
##   Unemployed                     0.014 0.000 0.007 0.021
##   Self-employed                  0.021 0.007 0.000 0.028
##   Student, without a student job 0.097 0.055 0.110 0.262
##   Other                          0.007 0.007 0.007 0.021
##   Sum                            0.401 0.283 0.317 1.001

# Row proportions (within each category); the Sum column is computed before
# rounding so it shows the true row total.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3   Sum
##   Student, with a student job    0.403 0.278 0.319 1.000
##   Employed                       0.360 0.440 0.200 1.000
##   Unemployed                     0.667 0.000 0.333 1.000
##   Self-employed                  0.750 0.250 0.000 1.000
##   Student, without a student job 0.368 0.211 0.421 1.000
##   Other                          0.333 0.333 0.333 0.999

# Column proportions (within each cluster); the Sum row is computed before
# rounding so it shows the true column total.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3
##   Student, with a student job    0.500 0.488 0.500
##   Employed                       0.155 0.268 0.109
##   Unemployed                     0.034 0.000 0.022
##   Self-employed                  0.052 0.024 0.000
##   Student, without a student job 0.241 0.195 0.348
##   Other                          0.017 0.024 0.022
##   Sum                            0.999 0.999 1.001

Marital status

# Chi-square test of independence: Marital_StatusF vs. Ward cluster membership.
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect (small expected counts); consider simulate.p.value = TRUE or
# fisher.test() before interpreting the p-value.
chi_square <- chisq.test(mydata$Marital_StatusF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$Marital_StatusF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Marital_StatusF and as.factor(mydata$ClusterWard)
## X-squared = 5.0417, df = 4, p-value = 0.283

# Pearson residuals per cell; `residuals` is spelled out instead of relying on
# partial matching of `$res`.
round(chi_square$residuals, 2)

##                       as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF     1     2     3
##      In a relationship  0.08 -1.02  0.88
##      Single            -0.18  1.04 -0.78
##      Married            0.95 -0.53 -0.56

round (chi_square$observed, 2)

##                       as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF  1  2  3
##      In a relationship 28 15 26
##      Single            29 26 20
##      Married            1  0  0

# Joint proportions; margins are added before rounding so the Sum entries are
# rounded totals, not sums of already-rounded cells (previously 0.999 overall).
round(addmargins(prop.table(chi_square$observed)), 3)

##                       as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF     1     2     3   Sum
##      In a relationship 0.193 0.103 0.179 0.475
##      Single            0.200 0.179 0.138 0.517
##      Married           0.007 0.000 0.000 0.007
##      Sum               0.400 0.282 0.317 0.999

# Row proportions (within each category); the Sum column is computed before
# rounding so it shows the true row total.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                       as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF     1     2     3   Sum
##      In a relationship 0.406 0.217 0.377 1.000
##      Single            0.387 0.347 0.267 1.001
##      Married           1.000 0.000 0.000 1.000

# Column proportions (within each cluster); the Sum row is computed before
# rounding so it shows the true column total.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                       as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF     1     2     3
##      In a relationship 0.483 0.366 0.565
##      Single            0.500 0.634 0.435
##      Married           0.017 0.000 0.000
##      Sum               1.000 1.000 1.000

Region

# Chi-square test of independence: RegionF vs. Ward cluster membership.
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect (small expected counts); consider simulate.p.value = TRUE or
# fisher.test() before interpreting the p-value.
chi_square <- chisq.test(mydata$RegionF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$RegionF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$RegionF and as.factor(mydata$ClusterWard)
## X-squared = 17.638, df = 14, p-value = 0.2238

# Pearson residuals per cell; `residuals` is spelled out instead of relying on
# partial matching of `$res`.
round(chi_square$residuals, 2)

##                    as.factor(mydata$ClusterWard)
## mydata$RegionF          1     2     3
##   Osrednjeslovenska  0.54 -0.54 -0.09
##   Stajerska         -0.25  0.50 -0.19
##   Gorenjska          0.37 -0.59  0.15
##   Notranjska         0.09  0.87 -0.93
##   Koroska            1.34 -0.75 -0.80
##   Primorska         -1.14 -0.63  1.88
##   Dolenjska         -1.23  1.82 -0.34
##   Prekmurje          0.73 -0.92  0.05

round (chi_square$observed, 2)

##                    as.factor(mydata$ClusterWard)
## mydata$RegionF       1  2  3
##   Osrednjeslovenska 19 10 13
##   Stajerska         15 13 12
##   Gorenjska         12  6  9
##   Notranjska         5  5  2
##   Koroska            2  0  0
##   Primorska          2  2  7
##   Dolenjska          1  5  2
##   Prekmurje          2  0  1

# Joint proportions; margins are added before rounding so the Sum entries are
# rounded totals, not sums of already-rounded cells.
round(addmargins(prop.table(chi_square$observed)), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$RegionF          1     2     3   Sum
##   Osrednjeslovenska 0.131 0.069 0.090 0.290
##   Stajerska         0.103 0.090 0.083 0.276
##   Gorenjska         0.083 0.041 0.062 0.186
##   Notranjska        0.034 0.034 0.014 0.082
##   Koroska           0.014 0.000 0.000 0.014
##   Primorska         0.014 0.014 0.048 0.076
##   Dolenjska         0.007 0.034 0.014 0.055
##   Prekmurje         0.014 0.000 0.007 0.021
##   Sum               0.400 0.282 0.318 1.000

# Row proportions (within each category); the Sum column is computed before
# rounding so it shows the true row total.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$RegionF          1     2     3   Sum
##   Osrednjeslovenska 0.452 0.238 0.310 1.000
##   Stajerska         0.375 0.325 0.300 1.000
##   Gorenjska         0.444 0.222 0.333 0.999
##   Notranjska        0.417 0.417 0.167 1.001
##   Koroska           1.000 0.000 0.000 1.000
##   Primorska         0.182 0.182 0.636 1.000
##   Dolenjska         0.125 0.625 0.250 1.000
##   Prekmurje         0.667 0.000 0.333 1.000

# Column proportions (within each cluster); the Sum row is computed before
# rounding so it shows the true column total.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$RegionF          1     2     3
##   Osrednjeslovenska 0.328 0.244 0.283
##   Stajerska         0.259 0.317 0.261
##   Gorenjska         0.207 0.146 0.196
##   Notranjska        0.086 0.122 0.043
##   Koroska           0.034 0.000 0.000
##   Primorska         0.034 0.049 0.152
##   Dolenjska         0.017 0.122 0.043
##   Prekmurje         0.034 0.000 0.022
##   Sum               0.999 1.000 1.000

Live

# Chi-square test of independence: LiveF (town / suburb / countryside) vs.
# Ward cluster membership; the second line prints the test result.
chi_square <- chisq.test(mydata$LiveF, as.factor(mydata$ClusterWard))
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$LiveF and as.factor(mydata$ClusterWard)
## X-squared = 6.5434, df = 4, p-value = 0.1621

# Pearson residuals per cell; `residuals` is spelled out instead of relying on
# partial matching of `$res`.
round(chi_square$residuals, 2)

##                  as.factor(mydata$ClusterWard)
## mydata$LiveF          1     2     3
##   Town             0.41  1.03 -1.43
##   Suburb          -0.18 -0.77  0.92
##   The countryside -0.38 -0.69  1.08

round (chi_square$observed, 2)

##                  as.factor(mydata$ClusterWard)
## mydata$LiveF       1  2  3
##   Town            31 25 16
##   Suburb          11  6 12
##   The countryside 16 10 18

# Joint proportions; margins are added before rounding so the Sum entries are
# rounded totals, not sums of already-rounded cells (previously 0.999 overall).
round(addmargins(prop.table(chi_square$observed)), 3)

##                  as.factor(mydata$ClusterWard)
## mydata$LiveF          1     2     3   Sum
##   Town            0.214 0.172 0.110 0.496
##   Suburb          0.076 0.041 0.083 0.200
##   The countryside 0.110 0.069 0.124 0.303
##   Sum             0.400 0.282 0.317 0.999

# Row proportions (within each category); the Sum column is computed before
# rounding so it shows the true row total.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                  as.factor(mydata$ClusterWard)
## mydata$LiveF          1     2     3   Sum
##   Town            0.431 0.347 0.222 1.000
##   Suburb          0.379 0.207 0.414 1.000
##   The countryside 0.364 0.227 0.409 1.000

# Column proportions (within each cluster); the Sum row is computed before
# rounding so it shows the true column total.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                  as.factor(mydata$ClusterWard)
## mydata$LiveF          1     2     3
##   Town            0.534 0.610 0.348
##   Suburb          0.190 0.146 0.261
##   The countryside 0.276 0.244 0.391
##   Sum             1.000 1.000 1.000

Bank

# Chi-square test of independence: BankF vs. Ward cluster membership.
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect (small expected counts); consider simulate.p.value = TRUE or
# fisher.test() before interpreting the p-value.
chi_square <- chisq.test(mydata$BankF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$BankF, as.factor(mydata$ClusterWard)): Chi-squared
## approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$BankF and as.factor(mydata$ClusterWard)
## X-squared = 20.67, df = 22, p-value = 0.5412

# Pearson residuals per cell; `residuals` is spelled out instead of relying on
# partial matching of `$res`.
round(chi_square$residuals, 2)

##                  as.factor(mydata$ClusterWard)
## mydata$BankF          1     2     3
##   NLB              0.81 -0.98  0.01
##   NKBM            -0.84  0.12  0.83
##   SKB             -0.67  1.07 -0.26
##   Revolut         -0.89  1.91 -0.80
##   N26             -0.18  0.16  0.05
##   Unicredit       -0.48  0.73 -0.15
##   Addiko          -0.63 -0.53  1.21
##   Gorenjska banka  1.44 -0.48 -1.16
##   Sparkasse       -0.63 -0.53  1.21
##   Delezna banka   -0.47  0.82 -0.24
##   Intesa Saopolo   0.39  0.23 -0.65
##   Other            0.29 -0.63  0.27

round (chi_square$observed, 2)

##                  as.factor(mydata$ClusterWard)
## mydata$BankF       1  2  3
##   NLB             21  9 14
##   NKBM            13 12 16
##   SKB              3  5  3
##   Revolut          0  2  0
##   N26              1  1  1
##   Unicredit        2  3  2
##   Addiko           0  0  1
##   Gorenjska banka  9  3  2
##   Sparkasse        0  0  1
##   Delezna banka    1  2  1
##   Intesa Saopolo   3  2  1
##   Other            5  2  4

# Joint proportions; margins are added before rounding so the Sum entries are
# rounded totals, not sums of already-rounded cells (previously 1.004 overall).
round(addmargins(prop.table(chi_square$observed)), 3)

##                  as.factor(mydata$ClusterWard)
## mydata$BankF          1     2     3   Sum
##   NLB             0.145 0.062 0.097 0.304
##   NKBM            0.090 0.083 0.110 0.283
##   SKB             0.021 0.034 0.021 0.076
##   Revolut         0.000 0.014 0.000 0.014
##   N26             0.007 0.007 0.007 0.021
##   Unicredit       0.014 0.021 0.014 0.049
##   Addiko          0.000 0.000 0.007 0.007
##   Gorenjska banka 0.062 0.021 0.014 0.097
##   Sparkasse       0.000 0.000 0.007 0.007
##   Delezna banka   0.007 0.014 0.007 0.028
##   Intesa Saopolo  0.021 0.014 0.007 0.042
##   Other           0.034 0.014 0.028 0.076
##   Sum             0.401 0.284 0.319 1.004

# Row proportions (within each category); the Sum column is computed before
# rounding so it shows the true row total.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                  as.factor(mydata$ClusterWard)
## mydata$BankF          1     2     3   Sum
##   NLB             0.477 0.205 0.318 1.000
##   NKBM            0.317 0.293 0.390 1.000
##   SKB             0.273 0.455 0.273 1.001
##   Revolut         0.000 1.000 0.000 1.000
##   N26             0.333 0.333 0.333 0.999
##   Unicredit       0.286 0.429 0.286 1.001
##   Addiko          0.000 0.000 1.000 1.000
##   Gorenjska banka 0.643 0.214 0.143 1.000
##   Sparkasse       0.000 0.000 1.000 1.000
##   Delezna banka   0.250 0.500 0.250 1.000
##   Intesa Saopolo  0.500 0.333 0.167 1.000
##   Other           0.455 0.182 0.364 1.001

# Column proportions (within each cluster); the Sum row is computed before
# rounding so it shows the true column total.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                  as.factor(mydata$ClusterWard)
## mydata$BankF          1     2     3
##   NLB             0.362 0.220 0.304
##   NKBM            0.224 0.293 0.348
##   SKB             0.052 0.122 0.065
##   Revolut         0.000 0.049 0.000
##   N26             0.017 0.024 0.022
##   Unicredit       0.034 0.073 0.043
##   Addiko          0.000 0.000 0.022
##   Gorenjska banka 0.155 0.073 0.043
##   Sparkasse       0.000 0.000 0.022
##   Delezna banka   0.017 0.049 0.022
##   Intesa Saopolo  0.052 0.049 0.022
##   Other           0.086 0.049 0.087
##   Sum             0.999 1.001 1.000

NLB project analysis Group1

Anej Levpuscek

2024-01-24

Euclidean distance

Using Spearman correlation coefficient