NLB project analysis Group1

#the data were partially cleaned in 1KA and Excel
# Load the survey export (semicolon-separated, decimal comma).
mydata <- read.table("./analiza-anketa.csv", header = TRUE, sep = ";", dec = ",")

# The first data row contains the question wording, not a response: drop it.
mydata <- mydata[-1, ]

# Sequential respondent ID. seq_len() is used instead of seq(1, n) because
# seq(1, 0) would count down to c(1, 0) if no rows were left.
mydata$ID <- seq_len(nrow(mydata))

head(mydata)

##   Q1a Q1b Q1c Q1d Q1e Q1f Q1g Q1h Q1i Q1j Q1k Q1l Q1l_text Q2 Q2_12_text Q3 Q4
## 2   0   0   0   0   0   0   0   1   0   0   0   0       -2  8         -2  1  5
## 3   1   0   0   0   0   0   0   0   0   0   0   0       -2  1         -2  3  2
## 4   1   0   0   1   0   0   0   0   0   0   0   0       -2  1         -2  1  2
## 5   1   0   0   0   0   0   0   0   0   0   0   0       -2  1         -2  2  4
## 6   0   0   0   1   0   0   0   0   0   0   1   0       -2 11         -2  2  2
## 7   0   0   1   0   0   0   0   0   0   0   0   0       -2  3         -2  1  2
##   Q5 Q6a Q6b Q6c Q6d Q7a Q7b Q7c Q7d Q7e Q7f Q7g Q7h Q7i Q7j Q7k Q7l Q8a Q8b
## 2  3   4   5   6   4   5   5   6   5   6   5   4   5   4   5   6   4   1   0
## 3  4   5   6   3   6   6   7   5   5   7   6   3   2   5   6   3   5   0   1
## 4  3   3   5   4   5   5   6   6   4   5   5   5   5   4   5   5   6   1   1
## 5  5   6   4   5   4   4   2   6   6   7   6   4   3   5   6   4   6   1   0
## 6  3   4   6   6   4   5   5   6   6   7   5   6   6   6   7   6   6   0   0
## 7  5   2   5   5   3   6   6   6   6   7   5   6   5   5   6   5   5   0   0
##   Q8c Q8d Q8e Q8f Q8g Q8h Q8i Q8j Q8k Q8k_text Q9a Q9b Q9c Q9d Q9e Q9f Q9g Q9h
## 2   1   0   0   0   0   0   0   1   0       -2   0   1   0   0   0   1   0   0
## 3   0   0   0   1   1   0   0   0   0       -2   0   0   1   0   1   0   0   0
## 4   0   0   0   1   0   0   0   0   0       -2   0   0   1   1   0   0   0   0
## 5   1   0   0   1   0   0   0   0   0       -2   0   0   0   0   1   0   1   0
## 6   0   0   0   1   0   0   1   1   0       -2   0   1   0   1   0   0   1   0
## 7   1   0   0   1   0   1   0   0   0       -2   0   0   0   0   1   0   1   0
##   Q9i Q9j Q9k Q9k_text Q10a Q10b Q10c Q10d Q10e Q10f Q11a Q11b Q11c Q11d Q11e
## 2   0   1   0       -2    5    5    5    4    5    4    1    1    1    0    0
## 3   0   1   0       -2    2    3    1    6    5    4    0    1    1    0    1
## 4   0   1   0       -2    2    2    2    2    3    2    0    1    1    1    0
## 5   0   1   0       -2    6    7    2    2    6    2    1    1    0    1    0
## 6   0   0   0       -2    2    2    5    2    3    2    0    1    1    0    0
## 7   0   1   0       -2    5    5    5    5    5    3    0    0    1    1    1
##   Q11f      Q11f_text Q13a Q13b Q13c Q13d Q13e Q13f Q13g Q13h Q13i Q14a Q14b
## 2    0             -2    5    6    4    6    6    6    5    5    6    0    0
## 3    0             -2    6    2    3    2    7    7    3    7    6    0    1
## 4    0             -2    5    5    4    4    5    6    5    6    6    0    1
## 5    0             -2    6    2    7    6    7    7    2    7    7    0    1
## 6    1 Letalske karte    5    6    2    4    6    6    5    6    6    0    0
## 7    0             -2    5    3    5    6    7    6    4    6    6    0    0
##   Q14c Q14d Q14e Q14f Q14g Q14h Q14i Q14j Q14j_text Q15a Q15b Q15c Q15d Q15e
## 2    0    1    0    1    0    1    0    0        -2    0    0    1    0    1
## 3    0    1    1    0    0    0    0    0        -2    1    1    0    0    1
## 4    1    0    1    0    0    0    0    0        -2    1    1    0    0    1
## 5    1    1    0    0    0    0    0    0        -2    0    0    0    1    1
## 6    1    0    1    0    1    0    0    0        -2    1    1    0    0    1
## 7    0    1    1    0    0    1    0    0        -2    1    0    0    0    0
##   Q15f Q15g Q15g_text Q16a Q16b Q16c Q16d Q16e Q16f Q16g Q16h Q16i Q17a Q17b
## 2    1    0        -2    5    5    5    5    5    5    5    5    6    8    8
## 3    0    0        -2    3    2    6    5    6    4    1    2    6    6   -1
## 4    0    0        -2    5    5    6    5    5    5    4    5    5    5    8
## 5    0    1 Apple Pay    4    4    6    6    6    6    4    4    6    6    8
## 6    0    0        -2    4    4    5    6    6    4    4    5    3    2    8
## 7    0    0        -2    4    4    6    6    6    5    4    4    6    8    8
##   Q17c Q17d Q17e Q17f Q18a Q18b Q18c Q18d Q18e Q18f Q19a Q19b Q19c Q19d Q19e
## 2    8    8    8    8    8    8    8    8    8    8    8    8    8    8    8
## 3   -1   -1   -1   -1    5   -1   -1   -1   -1   -1    5   -1   -1   -1   -1
## 4    8    8    6    8    5    8    8    8    6    8    3    3    4    4    3
## 5    8    8    6    8    5    8    8    8    6    8    6    8    8    6    6
## 6    6    8    7    6    3    8    5    8    6    6    4    5    6    5    6
## 7    6    8    8    8    8    8    6    8    8    8    8    8    5    8    8
##   Q19f Q20 Q21 Q22 Q22_6_text Q23 Q24 Q24_6_text Q25 Q26 Q26_4_text Q27 Q28 ID
## 2    8  23   2   1         -2   4   1         -2   1   1         -2   3   3  1
## 3   -1  20   1   5         -2   3   5         -2   1   1         -2   2   1  2
## 4    3  26   1   2         -2   5   2         -2   5   2         -2   7   3  3
## 5    8  25   2   2         -2   5   1         -2   3   2         -2   1   1  4
## 6    6  24   1   5         -2   5   1         -2   3   2         -2   4   1  5
## 7    8  25   2   4         -2   5   2         -2   4   2         -2   1   2  6

# Recode the numeric answer codes of the categorical questions into labelled
# factors (new *F columns); the raw Q* columns are left untouched.
# NOTE(review): several labels below carry a trailing space
# ("Secondary school ", "Unemployed ", "Gorenjska ", "SKB "); they are kept
# as-is here -- confirm nothing matches on them before trimming.
mydata$GenderF <- factor(mydata$Q21,
                         levels = c(2, 1, 4),
                         labels = c("F", "M", "Prefer not to say"))

# Residence is collapsed to two groups. Q22 codes: 1 = with parents,
# 2 = studio apartment, 3 = shared apartment, 4 = owned apartment,
# 5 = student dorms, 6 = other. Every code other than 1 maps to the SAME
# label; repeated labels merge those codes into a single factor level.
# (Previously code 2 had a trailing space and code 5 an empty label, which
# produced three separate "away" levels, one of them blank.)
mydata$ResidenceF <- factor(mydata$Q22,
                            levels = c(1, 2, 3, 4, 5, 6),
                            labels = c("With parents",
                                       "Away from parents",
                                       "Away from parents",
                                       "Away from parents",
                                       "Away from parents",
                                       "Away from parents"))

mydata$EducationF <- factor(mydata$Q23,
                            levels = c(4, 1, 2, 3, 5, 6),
                            labels = c("Bachelor degree", "Primary school",
                                       "Secondary school ", "Gymnasium",
                                       "Masters degree", "Phd"))

mydata$StatusF <- factor(mydata$Q24,
                         levels = c(1, 2, 3, 4, 5, 6),
                         labels = c("Student, with a student job", "Employed",
                                    "Unemployed ", "Self-employed",
                                    "Student, without a student job", "Other"))

mydata$Personal_Monthly_IncomeF <- factor(mydata$Q25,
                                          levels = c(1, 2, 3, 4, 5),
                                          labels = c("0-500", "501-1000",
                                                     "1001-1500", "1501-2000",
                                                     "More than 2000"))

mydata$Marital_StatusF <- factor(mydata$Q26,
                                 levels = c(1, 2, 3, 4),
                                 labels = c("In a relationship", "Single",
                                            "Married", "Other"))

mydata$RegionF <- factor(mydata$Q27,
                         levels = c(1, 2, 3, 4, 5, 6, 7, 8),
                         labels = c("Osrednjeslovenska", "Stajerska",
                                    "Gorenjska ", "Notranjska", "Koroska",
                                    "Primorska", "Dolenjska", "Prekmurje"))

mydata$LiveF <- factor(mydata$Q28,
                       levels = c(1, 2, 3),
                       labels = c("Town", "Suburb", "The countryside"))

mydata$BankF <- factor(mydata$Q2,
                       levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
                       labels = c("NLB", "NKBM", "SKB ", "Revolut", "N26",
                                  "Unicredit", "Addiko", "Gorenjska banka",
                                  "Sparkasse", "Delezna banka",
                                  "Intesa Saopolo", "Other"))

Clustering

# Survey items considered for clustering:
#   Q13e - simple and intuitive app
#   Q13f - security
#   Q7j  - transparency of information about terms and costs
#   Q7l  - reputation of the bank
#   Q10a - parents play an important role when opening a new bank account
#   Q7a  - terms for a long-term loan (listed, but not used below)
cluster_vars <- c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")

# Convert to numeric BEFORE filtering. Comparing first and converting
# afterwards means that, if a column was read as text, `> 0` is a string
# comparison whose result for codes like "-1" depends on locale collation.
# as.character() also guards against factor columns being turned into codes.
mydata[cluster_vars] <- lapply(mydata[cluster_vars],
                               function(x) as.numeric(as.character(x)))

# Keep respondents with a positive answer on every clustering item.
# which() drops rows whose test is NA instead of returning all-NA rows.
all_answered <- rowSums(mydata[cluster_vars] > 0) == length(cluster_vars)
mydata <- mydata[which(all_answered), ]

summary(mydata[cluster_vars]) # Describing clustering variables

##       Q13e            Q13f           Q7j             Q7l             Q10a      
##  Min.   :1.000   Min.   :3.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:6.000   1st Qu.:6.00   1st Qu.:5.000   1st Qu.:5.000   1st Qu.:4.000  
##  Median :7.000   Median :7.00   Median :6.000   Median :6.000   Median :5.000  
##  Mean   :6.387   Mean   :6.38   Mean   :5.813   Mean   :5.487   Mean   :4.853  
##  3rd Qu.:7.000   3rd Qu.:7.00   3rd Qu.:7.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.00   Max.   :7.000   Max.   :7.000   Max.   :7.000

# Standardise (z-score) the five clustering variables into their own data frame
mydata_clu_std <- as.data.frame(
  scale(mydata[, c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")])
)

# Outlier screen: Euclidean distance of each respondent from the origin of
# the standardised space (square root of the sum of squared z-scores)
mydata$Dissimilarity <- with(
  mydata_clu_std,
  sqrt(Q13e^2 + Q13f^2 + Q7j^2 + Q7l^2 + Q10a^2)
)

# The six respondents with the largest dissimilarity
mydata[head(order(-mydata$Dissimilarity)), c("ID", "Dissimilarity")]

##      ID Dissimilarity
## 38   37      7.599202
## 97   96      5.062270
## 124 123      4.978508
## 102 101      4.441051
## 34   33      4.399388
## 35   34      4.381617

# Show respondent ID 37. Select by ID rather than by row position: rows were
# filtered above, so position 37 is only ID 37 if no earlier row was removed.
print(mydata[mydata$ID == 37, ])

##    Q1a Q1b Q1c Q1d Q1e Q1f Q1g Q1h Q1i Q1j Q1k Q1l Q1l_text Q2 Q2_12_text Q3 Q4
## 38   1   0   0   1   0   0   0   0   0   0   0   0       -2  1         -2  2  1
##    Q5 Q6a Q6b Q6c Q6d Q7a Q7b Q7c Q7d Q7e Q7f Q7g Q7h Q7i Q7j Q7k Q7l Q8a Q8b
## 38  5   6   6   1   5   7   7   1   2   1   1   1   1   1   1   1   1   1   1
##    Q8c Q8d Q8e Q8f Q8g Q8h Q8i Q8j Q8k Q8k_text Q9a Q9b Q9c Q9d Q9e Q9f Q9g Q9h
## 38   0   0   1   0   0   0   0   0   0       -2   0   0   1   0   0   0   1   0
##    Q9i Q9j Q9k Q9k_text Q10a Q10b Q10c Q10d Q10e Q10f Q11a Q11b Q11c Q11d Q11e
## 38   0   1   0       -2    7    5    3    1    1    1    1    1    0    1    0
##    Q11f Q11f_text Q13a Q13b Q13c Q13d Q13e Q13f Q13g Q13h Q13i Q14a Q14b Q14c
## 38    0        -2    6    1    7    1    1    7    5    6    7    0    1    0
##    Q14d Q14e Q14f Q14g Q14h Q14i Q14j Q14j_text Q15a Q15b Q15c Q15d Q15e Q15f
## 38    1    1    0    0    0    0    0        -2    1    1    0    0    1    0
##    Q15g Q15g_text Q16a Q16b Q16c Q16d Q16e Q16f Q16g Q16h Q16i Q17a Q17b Q17c
## 38    0        -2    4    4    7    7    5    4    2    4    7    7    8    5
##    Q17d Q17e Q17f Q18a Q18b Q18c Q18d Q18e Q18f Q19a Q19b Q19c Q19d Q19e Q19f
## 38    8    7    8    5    8    8    8    7    8    7    8    6    8    5    8
##    Q20 Q21 Q22 Q22_6_text Q23 Q24 Q24_6_text Q25 Q26 Q26_4_text Q27 Q28 ID
## 38  -1   2   1         -2   4   5         -2   1   2         -2   1   1 37
##    GenderF   ResidenceF      EducationF                        StatusF
## 38       F With parents Bachelor degree Student, without a student job
##    Personal_Monthly_IncomeF Marital_StatusF           RegionF LiveF BankF
## 38                    0-500          Single Osrednjeslovenska  Town   NLB
##    Dissimilarity
## 38      7.599202

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Remove the outlier (ID 37) from the original data frame ...
mydata <- filter(mydata, !(ID %in% c(37)))

# ... and re-standardise the clustering variables without it
mydata_clu_std <- mydata[c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")] %>%
  scale() %>%
  as.data.frame()

#install.packages("factoextra")
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.3.2

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Euclidean distances between respondents, based on the 5 standardised
# cluster variables, saved into object Distances.
# The method is spelled "euclidean", matching the Ward clustering call below.
Distances <- get_dist(mydata_clu_std,
                      method = "euclidean")

# Squared distances
Distances2 <- Distances^2

# Heat map of the (squared) distance matrix
fviz_dist(Distances2,
          gradient = list(low = "darkred",
                          mid = "grey95",
                          high = "white"))

library(factoextra)
# Hopkins statistic for the standardised cluster variables (no plot)
get_clust_tendency(
  data = mydata_clu_std,
  n = nrow(mydata_clu_std) - 1,
  graph = FALSE
)

## $hopkins_stat
## [1] 0.7159374
## 
## $plot
## NULL

Euclidean distance

# Hierarchical clustering of the standardised cluster variables:
# Euclidean distances fed into hclust() with the "ward.D2" method.
library(dplyr)
WARD <- mydata_clu_std %>% # input: standardised cluster variables
  get_dist(method = "euclidean") %>%  # distance measure
  hclust(method = "ward.D2") # agglomeration method

# Print the fitted hclust object (call, method, distance, number of objects)
WARD

## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 149

library(factoextra)
# Plot the dendrogram of the hierarchical clustering stored in WARD
fviz_dend(WARD) #Dendrogram

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Cut the Ward tree into four groups and store each respondent's membership
mydata[["ClusterWard"]] <- cutree(tree = WARD, k = 4)

# First rows: respondent ID next to the assigned cluster
head(mydata[, c("ID", "ClusterWard")])

##   ID ClusterWard
## 1  1           1
## 2  2           2
## 3  3           2
## 4  4           3
## 5  5           2
## 6  6           3

# Initial leaders: mean of every standardised cluster variable per Ward cluster
Leaders_initial <- aggregate(
  x = mydata_clu_std,
  by = list(Group.1 = mydata$ClusterWard),
  FUN = mean
)

Leaders_initial

##   Group.1       Q13e       Q13f        Q7j        Q7l       Q10a
## 1       1 -1.1417980 -1.1771576 -0.3658404 -0.3762346  0.3916587
## 2       2  0.1765311  0.2048837  0.2418637  0.3145441 -1.7760976
## 3       3  0.1948608  0.1688476  0.1119981  0.4194214  0.4331117
## 4       4  0.5190073  0.6411094 -0.2764593 -1.6484563 -0.1160904

Using Spearman correlation coefficient

# Reload the survey export from scratch for the Spearman-based analysis
# (semicolon-separated, decimal comma).
mydata <- read.table("./analiza-anketa.csv", header = TRUE, sep = ";", dec = ",")

# The first data row contains the question wording, not a response: drop it.
mydata <- mydata[-1, ]

# Sequential respondent ID. seq_len() is used instead of seq(1, n) because
# seq(1, 0) would count down to c(1, 0) if no rows were left.
mydata$ID <- seq_len(nrow(mydata))

head(mydata)

##   Q1a Q1b Q1c Q1d Q1e Q1f Q1g Q1h Q1i Q1j Q1k Q1l Q1l_text Q2 Q2_12_text Q3 Q4
## 2   0   0   0   0   0   0   0   1   0   0   0   0       -2  8         -2  1  5
## 3   1   0   0   0   0   0   0   0   0   0   0   0       -2  1         -2  3  2
## 4   1   0   0   1   0   0   0   0   0   0   0   0       -2  1         -2  1  2
## 5   1   0   0   0   0   0   0   0   0   0   0   0       -2  1         -2  2  4
## 6   0   0   0   1   0   0   0   0   0   0   1   0       -2 11         -2  2  2
## 7   0   0   1   0   0   0   0   0   0   0   0   0       -2  3         -2  1  2
##   Q5 Q6a Q6b Q6c Q6d Q7a Q7b Q7c Q7d Q7e Q7f Q7g Q7h Q7i Q7j Q7k Q7l Q8a Q8b
## 2  3   4   5   6   4   5   5   6   5   6   5   4   5   4   5   6   4   1   0
## 3  4   5   6   3   6   6   7   5   5   7   6   3   2   5   6   3   5   0   1
## 4  3   3   5   4   5   5   6   6   4   5   5   5   5   4   5   5   6   1   1
## 5  5   6   4   5   4   4   2   6   6   7   6   4   3   5   6   4   6   1   0
## 6  3   4   6   6   4   5   5   6   6   7   5   6   6   6   7   6   6   0   0
## 7  5   2   5   5   3   6   6   6   6   7   5   6   5   5   6   5   5   0   0
##   Q8c Q8d Q8e Q8f Q8g Q8h Q8i Q8j Q8k Q8k_text Q9a Q9b Q9c Q9d Q9e Q9f Q9g Q9h
## 2   1   0   0   0   0   0   0   1   0       -2   0   1   0   0   0   1   0   0
## 3   0   0   0   1   1   0   0   0   0       -2   0   0   1   0   1   0   0   0
## 4   0   0   0   1   0   0   0   0   0       -2   0   0   1   1   0   0   0   0
## 5   1   0   0   1   0   0   0   0   0       -2   0   0   0   0   1   0   1   0
## 6   0   0   0   1   0   0   1   1   0       -2   0   1   0   1   0   0   1   0
## 7   1   0   0   1   0   1   0   0   0       -2   0   0   0   0   1   0   1   0
##   Q9i Q9j Q9k Q9k_text Q10a Q10b Q10c Q10d Q10e Q10f Q11a Q11b Q11c Q11d Q11e
## 2   0   1   0       -2    5    5    5    4    5    4    1    1    1    0    0
## 3   0   1   0       -2    2    3    1    6    5    4    0    1    1    0    1
## 4   0   1   0       -2    2    2    2    2    3    2    0    1    1    1    0
## 5   0   1   0       -2    6    7    2    2    6    2    1    1    0    1    0
## 6   0   0   0       -2    2    2    5    2    3    2    0    1    1    0    0
## 7   0   1   0       -2    5    5    5    5    5    3    0    0    1    1    1
##   Q11f      Q11f_text Q13a Q13b Q13c Q13d Q13e Q13f Q13g Q13h Q13i Q14a Q14b
## 2    0             -2    5    6    4    6    6    6    5    5    6    0    0
## 3    0             -2    6    2    3    2    7    7    3    7    6    0    1
## 4    0             -2    5    5    4    4    5    6    5    6    6    0    1
## 5    0             -2    6    2    7    6    7    7    2    7    7    0    1
## 6    1 Letalske karte    5    6    2    4    6    6    5    6    6    0    0
## 7    0             -2    5    3    5    6    7    6    4    6    6    0    0
##   Q14c Q14d Q14e Q14f Q14g Q14h Q14i Q14j Q14j_text Q15a Q15b Q15c Q15d Q15e
## 2    0    1    0    1    0    1    0    0        -2    0    0    1    0    1
## 3    0    1    1    0    0    0    0    0        -2    1    1    0    0    1
## 4    1    0    1    0    0    0    0    0        -2    1    1    0    0    1
## 5    1    1    0    0    0    0    0    0        -2    0    0    0    1    1
## 6    1    0    1    0    1    0    0    0        -2    1    1    0    0    1
## 7    0    1    1    0    0    1    0    0        -2    1    0    0    0    0
##   Q15f Q15g Q15g_text Q16a Q16b Q16c Q16d Q16e Q16f Q16g Q16h Q16i Q17a Q17b
## 2    1    0        -2    5    5    5    5    5    5    5    5    6    8    8
## 3    0    0        -2    3    2    6    5    6    4    1    2    6    6   -1
## 4    0    0        -2    5    5    6    5    5    5    4    5    5    5    8
## 5    0    1 Apple Pay    4    4    6    6    6    6    4    4    6    6    8
## 6    0    0        -2    4    4    5    6    6    4    4    5    3    2    8
## 7    0    0        -2    4    4    6    6    6    5    4    4    6    8    8
##   Q17c Q17d Q17e Q17f Q18a Q18b Q18c Q18d Q18e Q18f Q19a Q19b Q19c Q19d Q19e
## 2    8    8    8    8    8    8    8    8    8    8    8    8    8    8    8
## 3   -1   -1   -1   -1    5   -1   -1   -1   -1   -1    5   -1   -1   -1   -1
## 4    8    8    6    8    5    8    8    8    6    8    3    3    4    4    3
## 5    8    8    6    8    5    8    8    8    6    8    6    8    8    6    6
## 6    6    8    7    6    3    8    5    8    6    6    4    5    6    5    6
## 7    6    8    8    8    8    8    6    8    8    8    8    8    5    8    8
##   Q19f Q20 Q21 Q22 Q22_6_text Q23 Q24 Q24_6_text Q25 Q26 Q26_4_text Q27 Q28 ID
## 2    8  23   2   1         -2   4   1         -2   1   1         -2   3   3  1
## 3   -1  20   1   5         -2   3   5         -2   1   1         -2   2   1  2
## 4    3  26   1   2         -2   5   2         -2   5   2         -2   7   3  3
## 5    8  25   2   2         -2   5   1         -2   3   2         -2   1   1  4
## 6    6  24   1   5         -2   5   1         -2   3   2         -2   4   1  5
## 7    8  25   2   4         -2   5   2         -2   4   2         -2   1   2  6

# Clustering variables used throughout this section
cluster_vars <- c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")

# Convert to numeric BEFORE filtering. Comparing first and converting
# afterwards means that, if a column was read as text, `> 0` is a string
# comparison whose result for codes like "-1" depends on locale collation.
# as.character() also guards against factor columns being turned into codes.
mydata[cluster_vars] <- lapply(mydata[cluster_vars],
                               function(x) as.numeric(as.character(x)))

# Keep respondents with a positive answer on every clustering item.
# which() drops rows whose test is NA instead of returning all-NA rows.
all_answered <- rowSums(mydata[cluster_vars] > 0) == length(cluster_vars)
mydata <- mydata[which(all_answered), ]

library(dplyr)
mydata <- mydata %>%
  filter(!ID %in% c(37)) # Removing ID37 (outlier) from original data frame

# Standardised cluster variables, computed without the outlier
mydata_clu_std <- as.data.frame(scale(mydata[cluster_vars]))


# Hierarchical clustering of the standardised cluster variables, this time
# with get_dist(method = "spearman") and hclust(method = "ward.D").
# NOTE(review): Ward linkage is normally motivated for Euclidean distances;
# presumably get_dist() returns a correlation-based dissimilarity here --
# confirm that combining it with "ward.D" is intended.
library(dplyr)
WARD <- mydata_clu_std %>% # input: standardised cluster variables
  get_dist(method = "spearman") %>%# distance measure
  hclust(method = "ward.D") # agglomeration method

# Print the fitted hclust object
WARD

## 
## Call:
## hclust(d = ., method = "ward.D")
## 
## Cluster method   : ward.D 
## Number of objects: 149

library(factoextra)
# Plot the dendrogram of the hierarchical clustering stored in WARD
fviz_dend(WARD) #Dendrogram

# Cut the Ward tree into three groups and store each respondent's membership
mydata[["ClusterWard"]] <- cutree(tree = WARD, k = 3)

# First rows: respondent ID next to the assigned cluster
head(mydata[, c("ID", "ClusterWard")])

##   ID ClusterWard
## 1  1           1
## 2  2           1
## 3  3           2
## 4  4           1
## 5  5           2
## 6  6           3

# Initial leaders (intended as the starting point for k-means clustering):
# mean of every standardised cluster variable per Ward cluster
Leaders_initial <- aggregate(
  x = mydata_clu_std,
  by = list(Group.1 = mydata$ClusterWard),
  FUN = mean
)

Leaders_initial

##   Group.1       Q13e        Q13f         Q7j          Q7l       Q10a
## 1       1  0.2681796  0.58583604 -0.29924274 -0.537584391 -0.1710966
## 2       2 -0.0358253  0.04793232  0.45038760  0.782974895 -0.5789471
## 3       3 -0.3046238 -0.77323723 -0.01065265  0.003189433  0.7083880

# Use the Ward cluster means as the centroids and show them to 3 decimals
Centroids <- Leaders_initial
round(Centroids, digits = 3)

##   Group.1   Q13e   Q13f    Q7j    Q7l   Q10a
## 1       1  0.268  0.586 -0.299 -0.538 -0.171
## 2       2 -0.036  0.048  0.450  0.783 -0.579
## 3       3 -0.305 -0.773 -0.011  0.003  0.708

library(ggplot2)
library(tidyr)

# Reshape the centroids to long format: one row per (cluster, variable) pair
Picture <- as.data.frame(Centroids)
Picture$id <- seq_len(nrow(Picture)) # seq_len() is safe for zero rows
Picture <- pivot_longer(Picture, cols = c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a"))

# Cluster label as a factor. Levels are taken from the data, so the plot
# keeps working if the number of clusters is changed.
Picture$Group <- factor(Picture$id)

# Fix the x-axis order of the variables (labels default to the level names)
Picture$nameFactor <- factor(Picture$name,
                             levels = c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a"))

# Profile plot: one line per cluster across the standardised cluster variables
ggplot(Picture, aes(x = nameFactor, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Group, col = Group), size = 3) +
  geom_line(aes(group = id, linetype = Group, col = Group), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables")

# Number of respondents in each Ward cluster
table(mydata$ClusterWard)

## 
##  1  2  3 
## 60 41 48

# Check whether the cluster variables differentiate between the clusters:
# one ANOVA per cluster variable, with cluster membership as the factor.
fit <- aov(cbind(Q13e, Q13f, Q7j, Q7l, Q10a) ~ as.factor(ClusterWard), 
           data = mydata)

# Print one ANOVA table per response variable
summary(fit)

##  Response Q13e :
##                         Df  Sum Sq Mean Sq F value  Pr(>F)  
## as.factor(ClusterWard)   2   7.294  3.6469  4.6272 0.01126 *
## Residuals              146 115.069  0.7881                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q13f :
##                         Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterWard)   2 32.352 16.1759  36.558 1.345e-13 ***
## Residuals              146 64.601  0.4425                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q7j :
##                         Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterWard)   2  18.271  9.1354  7.4438 0.0008349 ***
## Residuals              146 179.179  1.2273                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q7l :
##                         Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterWard)   2  68.652  34.326  29.384 1.887e-11 ***
## Residuals              146 170.556   1.168                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q10a :
##                         Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterWard)   2 126.28  63.141  26.655 1.356e-10 ***
## Residuals              146 345.85   2.369                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Q20: convert to numeric BEFORE filtering, so `> 0` is a numeric comparison
# and not a string comparison on a column that may have been read as text.
mydata$Q20 <- as.numeric(as.character(mydata$Q20))

# Keep respondents with a positive Q20 value; which() drops NA instead of
# returning all-NA rows.
mydata <- mydata[which(mydata$Q20 > 0), ]

# Mean of Q20 per Ward cluster
aggregate(mydata$Q20,
          by = list(mydata$ClusterWard),
          FUN = mean)

##   Group.1        x
## 1       1 22.45763
## 2       2 22.73171
## 3       3 21.86957

# Q21: convert to numeric BEFORE filtering, so `> 0` is a numeric comparison
# and not a string comparison on a column that may have been read as text.
mydata$Q21 <- as.numeric(as.character(mydata$Q21))

# Keep respondents with a positive Q21 value; which() drops NA instead of
# returning all-NA rows.
mydata <- mydata[which(mydata$Q21 > 0), ]

# Mean of the Q21 code per Ward cluster.
# NOTE(review): Q21 is recoded elsewhere in this file as a nominal factor
# (2 = F, 1 = M, 4 = Prefer not to say), so the mean of its codes is not
# directly interpretable -- confirm this summary is wanted.
aggregate(mydata$Q21,
          by = list(mydata$ClusterWard),
          FUN = mean)

##   Group.1        x
## 1       1 1.689655
## 2       2 1.512195
## 3       3 1.608696

Age

# One-way ANOVA: does mean Q20 differ between the Ward clusters?
fit <- aov(Q20 ~ as.factor(ClusterWard), 
           data = mydata)

# Print the ANOVA table
summary.aov(fit)

##                         Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterWard)   2   17.6   8.801   1.713  0.184
## Residuals              142  729.7   5.139

# Mean of Q20 per Ward cluster (after the Q21 filter above)
aggregate(
  x = mydata$Q20,
  by = list(Group.1 = mydata$ClusterWard),
  FUN = mean
)

##   Group.1        x
## 1       1 22.48276
## 2       2 22.73171
## 3       3 21.86957

# Turn the numeric answer codes of the categorical questions into labelled
# factors, stored in new *F columns next to the raw Q* columns.
# NOTE(review): some labels end in a space ("Studio apartment ",
# "Secondary school ", "Unemployed ", "Gorenjska ", "SKB "); they are
# reproduced unchanged -- confirm nothing matches on them before trimming.
mydata$GenderF <- factor(
  mydata$Q21,
  levels = c(2, 1, 4),
  labels = c("F", "M", "Prefer not to say")
)

mydata$ResidenceF <- factor(
  mydata$Q22,
  levels = 1:6,
  labels = c("With parents", "Studio apartment ", "Shared apartment",
             "Owned apartment", "Student dorms", "Other")
)

mydata$EducationF <- factor(
  mydata$Q23,
  levels = c(4, 1, 2, 3, 5, 6),
  labels = c("Bachelor degree", "Primary school", "Secondary school ",
             "Gymnasium", "Masters degree", "Phd")
)

mydata$StatusF <- factor(
  mydata$Q24,
  levels = 1:6,
  labels = c("Student, with a student job", "Employed", "Unemployed ",
             "Self-employed", "Student, without a student job", "Other")
)

mydata$Personal_Monthly_IncomeF <- factor(
  mydata$Q25,
  levels = 1:5,
  labels = c("0-500", "501-1000", "1001-1500", "1501-2000", "More than 2000")
)

mydata$Marital_StatusF <- factor(
  mydata$Q26,
  levels = 1:4,
  labels = c("In a relationship", "Single", "Married", "Other")
)

mydata$RegionF <- factor(
  mydata$Q27,
  levels = 1:8,
  labels = c("Osrednjeslovenska", "Stajerska", "Gorenjska ", "Notranjska",
             "Koroska", "Primorska", "Dolenjska", "Prekmurje")
)

mydata$LiveF <- factor(
  mydata$Q28,
  levels = 1:3,
  labels = c("Town", "Suburb", "The countryside")
)

mydata$BankF <- factor(
  mydata$Q2,
  levels = 1:12,
  labels = c("NLB", "NKBM", "SKB ", "Revolut", "N26", "Unicredit", "Addiko",
             "Gorenjska banka", "Sparkasse", "Delezna banka",
             "Intesa Saopolo", "Other")
)

Gender

# Pearson chi-squared test of independence: gender vs. Ward cluster.
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect -- check the expected cell counts before relying on the p-value.
chi_square <- chisq.test(mydata$GenderF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$GenderF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

# Print the test result
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$GenderF and as.factor(mydata$ClusterWard)
## X-squared = 8.1946, df = 4, p-value = 0.0847

# Residuals per cell. The component is spelled out in full: `$res` only
# worked through partial matching of `$residuals`.
round(chi_square$residuals, 2)

##                    as.factor(mydata$ClusterWard)
## mydata$GenderF          1     2     3
##   F                  0.95 -1.28  0.14
##   M                 -1.08  1.38 -0.09
##   Prefer not to say -0.63  1.35 -0.56

# Observed counts: gender by cluster
round(chi_square$observed, digits = 2)

##                    as.factor(mydata$ClusterWard)
## mydata$GenderF       1  2  3
##   F                 40 18 28
##   M                 18 22 18
##   Prefer not to say  0  1  0

# Cell proportions of the grand total, with margins. Margins are added
# before rounding so the totals are not sums of already-rounded cells.
round(addmargins(prop.table(chi_square$observed)), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$GenderF          1     2     3   Sum
##   F                 0.276 0.124 0.193 0.593
##   M                 0.124 0.152 0.124 0.400
##   Prefer not to say 0.000 0.007 0.000 0.007
##   Sum               0.400 0.283 0.317 1.000

# Row proportions (within gender) with a row total. The total is added
# before rounding so it is not a sum of already-rounded cells.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$GenderF          1     2     3   Sum
##   F                 0.465 0.209 0.326 1.000
##   M                 0.310 0.379 0.310 0.999
##   Prefer not to say 0.000 1.000 0.000 1.000

# Column proportions (within cluster) with a column total. The total is
# added before rounding so it is not a sum of already-rounded cells.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$GenderF          1     2     3
##   F                 0.690 0.439 0.609
##   M                 0.310 0.537 0.391
##   Prefer not to say 0.000 0.024 0.000
##   Sum               1.000 1.000 1.000

Residence

# Pearson chi-squared test of independence: residence vs. Ward cluster.
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect -- check the expected cell counts before relying on the p-value.
chi_square <- chisq.test(mydata$ResidenceF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$ResidenceF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

# Print the test result
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$ResidenceF and as.factor(mydata$ClusterWard)
## X-squared = 10.978, df = 10, p-value = 0.3593

# Residuals per cell. The component is spelled out in full: `$res` only
# worked through partial matching of `$residuals`.
round(chi_square$residuals, 2)

##                    as.factor(mydata$ClusterWard)
## mydata$ResidenceF       1     2     3
##   With parents      -0.48 -0.42  0.93
##   Studio apartment  -0.31  1.00 -0.60
##   Shared apartment   0.00  1.41 -1.33
##   Owned apartment   -0.09  0.17 -0.06
##   Student dorms      1.23 -1.45 -0.01
##   Other              0.22 -0.75  0.46

# Observed counts: residence by cluster
round(chi_square$observed, digits = 2)

##                    as.factor(mydata$ClusterWard)
## mydata$ResidenceF    1  2  3
##   With parents      27 19 28
##   Studio apartment   6  7  4
##   Shared apartment   8  9  3
##   Owned apartment    5  4  4
##   Student dorms     11  2  6
##   Other              1  0  1

# Cell proportions of the grand total, with margins. Margins are added
# before rounding so the totals are not sums of already-rounded cells.
round(addmargins(prop.table(chi_square$observed)), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$ResidenceF       1     2     3   Sum
##   With parents      0.186 0.131 0.193 0.510
##   Studio apartment  0.041 0.048 0.028 0.117
##   Shared apartment  0.055 0.062 0.021 0.138
##   Owned apartment   0.034 0.028 0.028 0.090
##   Student dorms     0.076 0.014 0.041 0.131
##   Other             0.007 0.000 0.007 0.014
##   Sum               0.399 0.283 0.318 1.000

# Row proportions (within residence type) with a row total. The total is
# added before rounding so it is not a sum of already-rounded cells.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$ResidenceF       1     2     3   Sum
##   With parents      0.365 0.257 0.378 1.000
##   Studio apartment  0.353 0.412 0.235 1.000
##   Shared apartment  0.400 0.450 0.150 1.000
##   Owned apartment   0.385 0.308 0.308 1.001
##   Student dorms     0.579 0.105 0.316 1.000
##   Other             0.500 0.000 0.500 1.000

# Column proportions (within cluster) with a column total. The total is
# added before rounding so it is not a sum of already-rounded cells.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$ResidenceF       1     2     3
##   With parents      0.466 0.463 0.609
##   Studio apartment  0.103 0.171 0.087
##   Shared apartment  0.138 0.220 0.065
##   Owned apartment   0.086 0.098 0.087
##   Student dorms     0.190 0.049 0.130
##   Other             0.017 0.000 0.022
##   Sum               1.000 1.001 1.000

Education

# Pearson chi-squared test of independence: education vs. Ward cluster.
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect -- check the expected cell counts before relying on the p-value.
chi_square <- chisq.test(mydata$EducationF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$EducationF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

# Print the test result
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$EducationF and as.factor(mydata$ClusterWard)
## X-squared = 7.2304, df = 6, p-value = 0.3001

# Residuals per cell. The component is spelled out in full: `$res` only
# worked through partial matching of `$residuals`.
round(chi_square$residuals, 2)

##                    as.factor(mydata$ClusterWard)
## mydata$EducationF       1     2     3
##   Bachelor degree    0.31 -0.38  0.01
##   Secondary school  -1.63  1.83  0.11
##   Gymnasium          0.64 -0.70 -0.06
##   Masters degree    -0.16  0.22 -0.03

# Observed counts: education by cluster
round(chi_square$observed, digits = 2)

##                    as.factor(mydata$ClusterWard)
## mydata$EducationF    1  2  3
##   Bachelor degree   28 17 21
##   Secondary school   2  8  5
##   Gymnasium         22 11 15
##   Masters degree     6  5  5

# Cell proportions of the grand total, with margins. Margins are added
# before rounding so the totals are not sums of already-rounded cells.
round(addmargins(prop.table(chi_square$observed)), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$EducationF       1     2     3   Sum
##   Bachelor degree   0.193 0.117 0.145 0.455
##   Secondary school  0.014 0.055 0.034 0.103
##   Gymnasium         0.152 0.076 0.103 0.331
##   Masters degree    0.041 0.034 0.034 0.109
##   Sum               0.400 0.282 0.316 0.998

# Row proportions (within education level) with a row total. The total is
# added before rounding so it is not a sum of already-rounded cells.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$EducationF       1     2     3   Sum
##   Bachelor degree   0.424 0.258 0.318 1.000
##   Secondary school  0.133 0.533 0.333 0.999
##   Gymnasium         0.458 0.229 0.312 0.999
##   Masters degree    0.375 0.312 0.312 0.999

# Column proportions (within cluster) with a column total. The total is
# added before rounding so it is not a sum of already-rounded cells.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$EducationF       1     2     3
##   Bachelor degree   0.483 0.415 0.457
##   Secondary school  0.034 0.195 0.109
##   Gymnasium         0.379 0.268 0.326
##   Masters degree    0.103 0.122 0.109
##   Sum               0.999 1.000 1.001

Status

# Pearson chi-squared test of independence: employment status vs. cluster.
# NOTE(review): the warning echoed below is raised by chisq.test() when
# expected cell counts are small (below 5); several status categories have
# only a handful of respondents, so the p-value is only approximate.
chi_square <- chisq.test(mydata$StatusF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$StatusF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$StatusF and as.factor(mydata$ClusterWard)
## X-squared = 9.421, df = 10, p-value = 0.4927

# Pearson residuals, (observed - expected) / sqrt(expected). The component is
# named in full because `$res` only resolves through partial name matching.
round(chi_square$residuals, 2)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3
##   Student, with a student job     0.04 -0.08  0.03
##   Employed                       -0.32  1.48 -1.04
##   Unemployed                      0.73 -0.92  0.05
##   Self-employed                   1.11 -0.12 -1.13
##   Student, without a student job -0.31 -0.84  1.14
##   Other                          -0.18  0.16  0.05

# Observed counts per status and cluster.
round(chi_square$observed, 2)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                    1  2  3
##   Student, with a student job    29 20 23
##   Employed                        9 11  5
##   Unemployed                      2  0  1
##   Self-employed                   3  1  0
##   Student, without a student job 14  8 16
##   Other                           1  1  1

# Share of the grand total per cell. Margins are added BEFORE rounding so the
# "Sum" entries are rounded true totals instead of sums of rounded cells,
# which need not add up to 1.
round(addmargins(prop.table(chi_square$observed)), 3)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3   Sum
##   Student, with a student job    0.200 0.138 0.159 0.497
##   Employed                       0.062 0.076 0.034 0.172
##   Unemployed                     0.014 0.000 0.007 0.021
##   Self-employed                  0.021 0.007 0.000 0.028
##   Student, without a student job 0.097 0.055 0.110 0.262
##   Other                          0.007 0.007 0.007 0.021
##   Sum                            0.400 0.283 0.317 1.000

# Row proportions: distribution over clusters within each status.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3   Sum
##   Student, with a student job    0.403 0.278 0.319 1.000
##   Employed                       0.360 0.440 0.200 1.000
##   Unemployed                     0.667 0.000 0.333 1.000
##   Self-employed                  0.750 0.250 0.000 1.000
##   Student, without a student job 0.368 0.211 0.421 1.000
##   Other                          0.333 0.333 0.333 1.000

# Column proportions: distribution over statuses within each cluster.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3
##   Student, with a student job    0.500 0.488 0.500
##   Employed                       0.155 0.268 0.109
##   Unemployed                     0.034 0.000 0.022
##   Self-employed                  0.052 0.024 0.000
##   Student, without a student job 0.241 0.195 0.348
##   Other                          0.017 0.024 0.022
##   Sum                            1.000 1.000 1.000

Personal monthly income

# Pearson chi-squared test of independence: personal monthly income bracket
# vs. cluster.
# NOTE(review): the warning echoed below is raised by chisq.test() when
# expected cell counts are small (below 5); with p = 0.068 close to the usual
# 0.05 threshold, an exact or simulated p-value would be worth checking.
chi_square <- chisq.test(mydata$Personal_Monthly_IncomeF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$Personal_Monthly_IncomeF,
## as.factor(mydata$ClusterWard)): Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Personal_Monthly_IncomeF and as.factor(mydata$ClusterWard)
## X-squared = 14.564, df = 8, p-value = 0.0682

# Pearson residuals, (observed - expected) / sqrt(expected). The component is
# named in full because `$res` only resolves through partial name matching.
round(chi_square$residuals, 2)

##                 as.factor(mydata$ClusterWard)
##                      1     2     3
##   0-500           0.08 -1.70  1.52
##   501-1000        0.34  0.65 -0.99
##   1001-1500      -0.55  1.16 -0.48
##   1501-2000       0.67 -0.04 -0.72
##   More than 2000 -1.00  1.89 -0.66

# Observed counts per income bracket and cluster.
round(chi_square$observed, 2)

##                 as.factor(mydata$ClusterWard)
##                   1  2  3
##   0-500          28 12 29
##   501-1000       14 11  7
##   1001-1500       5  7  4
##   1501-2000       9  5  4
##   More than 2000  2  6  2

# Share of the grand total per cell. Margins are added BEFORE rounding so the
# "Sum" entries are rounded true totals instead of sums of rounded cells,
# which need not add up to 1.
round(addmargins(prop.table(chi_square$observed)), 3)

##                 as.factor(mydata$ClusterWard)
##                      1     2     3   Sum
##   0-500          0.193 0.083 0.200 0.476
##   501-1000       0.097 0.076 0.048 0.221
##   1001-1500      0.034 0.048 0.028 0.110
##   1501-2000      0.062 0.034 0.028 0.124
##   More than 2000 0.014 0.041 0.014 0.069
##   Sum            0.400 0.283 0.317 1.000

# Row proportions: distribution over clusters within each income bracket.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                 as.factor(mydata$ClusterWard)
##                      1     2     3   Sum
##   0-500          0.406 0.174 0.420 1.000
##   501-1000       0.438 0.344 0.219 1.000
##   1001-1500      0.312 0.438 0.250 1.000
##   1501-2000      0.500 0.278 0.222 1.000
##   More than 2000 0.200 0.600 0.200 1.000

# Column proportions: distribution over income brackets within each cluster.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                 as.factor(mydata$ClusterWard)
##                      1     2     3
##   0-500          0.483 0.293 0.630
##   501-1000       0.241 0.268 0.152
##   1001-1500      0.086 0.171 0.087
##   1501-2000      0.155 0.122 0.087
##   More than 2000 0.034 0.146 0.043
##   Sum            1.000 1.000 1.000

Status

# Pearson chi-squared test of independence: employment status vs. cluster.
# NOTE(review): this section repeats the Status analysis that already appears
# earlier in the file with identical results; confirm whether it is needed.
# NOTE(review): the warning echoed below is raised by chisq.test() when
# expected cell counts are small (below 5), so the p-value is only approximate.
chi_square <- chisq.test(mydata$StatusF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$StatusF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$StatusF and as.factor(mydata$ClusterWard)
## X-squared = 9.421, df = 10, p-value = 0.4927

# Pearson residuals, (observed - expected) / sqrt(expected). The component is
# named in full because `$res` only resolves through partial name matching.
round(chi_square$residuals, 2)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3
##   Student, with a student job     0.04 -0.08  0.03
##   Employed                       -0.32  1.48 -1.04
##   Unemployed                      0.73 -0.92  0.05
##   Self-employed                   1.11 -0.12 -1.13
##   Student, without a student job -0.31 -0.84  1.14
##   Other                          -0.18  0.16  0.05

# Observed counts per status and cluster.
round(chi_square$observed, 2)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                    1  2  3
##   Student, with a student job    29 20 23
##   Employed                        9 11  5
##   Unemployed                      2  0  1
##   Self-employed                   3  1  0
##   Student, without a student job 14  8 16
##   Other                           1  1  1

# Share of the grand total per cell. Margins are added BEFORE rounding so the
# "Sum" entries are rounded true totals instead of sums of rounded cells,
# which need not add up to 1.
round(addmargins(prop.table(chi_square$observed)), 3)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3   Sum
##   Student, with a student job    0.200 0.138 0.159 0.497
##   Employed                       0.062 0.076 0.034 0.172
##   Unemployed                     0.014 0.000 0.007 0.021
##   Self-employed                  0.021 0.007 0.000 0.028
##   Student, without a student job 0.097 0.055 0.110 0.262
##   Other                          0.007 0.007 0.007 0.021
##   Sum                            0.400 0.283 0.317 1.000

# Row proportions: distribution over clusters within each status.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3   Sum
##   Student, with a student job    0.403 0.278 0.319 1.000
##   Employed                       0.360 0.440 0.200 1.000
##   Unemployed                     0.667 0.000 0.333 1.000
##   Self-employed                  0.750 0.250 0.000 1.000
##   Student, without a student job 0.368 0.211 0.421 1.000
##   Other                          0.333 0.333 0.333 1.000

# Column proportions: distribution over statuses within each cluster.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                                 as.factor(mydata$ClusterWard)
## mydata$StatusF                       1     2     3
##   Student, with a student job    0.500 0.488 0.500
##   Employed                       0.155 0.268 0.109
##   Unemployed                     0.034 0.000 0.022
##   Self-employed                  0.052 0.024 0.000
##   Student, without a student job 0.241 0.195 0.348
##   Other                          0.017 0.024 0.022
##   Sum                            1.000 1.000 1.000

Marital status

# Pearson chi-squared test of independence: marital status vs. cluster.
# NOTE(review): the warning echoed below is raised by chisq.test() when
# expected cell counts are small (below 5); "Married" has a single respondent,
# so the p-value is only approximate.
chi_square <- chisq.test(mydata$Marital_StatusF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$Marital_StatusF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Marital_StatusF and as.factor(mydata$ClusterWard)
## X-squared = 5.0417, df = 4, p-value = 0.283

# Pearson residuals, (observed - expected) / sqrt(expected). The component is
# named in full because `$res` only resolves through partial name matching.
round(chi_square$residuals, 2)

##                       as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF     1     2     3
##      In a relationship  0.08 -1.02  0.88
##      Single            -0.18  1.04 -0.78
##      Married            0.95 -0.53 -0.56

# Observed counts per marital status and cluster.
round(chi_square$observed, 2)

##                       as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF  1  2  3
##      In a relationship 28 15 26
##      Single            29 26 20
##      Married            1  0  0

# Share of the grand total per cell. Margins are added BEFORE rounding so the
# "Sum" entries are rounded true totals instead of sums of rounded cells,
# which need not add up to 1.
round(addmargins(prop.table(chi_square$observed)), 3)

##                       as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF     1     2     3   Sum
##      In a relationship 0.193 0.103 0.179 0.476
##      Single            0.200 0.179 0.138 0.517
##      Married           0.007 0.000 0.000 0.007
##      Sum               0.400 0.283 0.317 1.000

# Row proportions: distribution over clusters within each marital status.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                       as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF     1     2     3   Sum
##      In a relationship 0.406 0.217 0.377 1.000
##      Single            0.387 0.347 0.267 1.000
##      Married           1.000 0.000 0.000 1.000

# Column proportions: distribution over marital statuses within each cluster.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                       as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF     1     2     3
##      In a relationship 0.483 0.366 0.565
##      Single            0.500 0.634 0.435
##      Married           0.017 0.000 0.000
##      Sum               1.000 1.000 1.000

Region

# Pearson chi-squared test of independence: region vs. cluster.
# NOTE(review): the warning echoed below is raised by chisq.test() when
# expected cell counts are small (below 5); several regions have very few
# respondents, so the p-value is only approximate.
chi_square <- chisq.test(mydata$RegionF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$RegionF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$RegionF and as.factor(mydata$ClusterWard)
## X-squared = 17.638, df = 14, p-value = 0.2238

# Pearson residuals, (observed - expected) / sqrt(expected). The component is
# named in full because `$res` only resolves through partial name matching.
round(chi_square$residuals, 2)

##                    as.factor(mydata$ClusterWard)
## mydata$RegionF          1     2     3
##   Osrednjeslovenska  0.54 -0.54 -0.09
##   Stajerska         -0.25  0.50 -0.19
##   Gorenjska          0.37 -0.59  0.15
##   Notranjska         0.09  0.87 -0.93
##   Koroska            1.34 -0.75 -0.80
##   Primorska         -1.14 -0.63  1.88
##   Dolenjska         -1.23  1.82 -0.34
##   Prekmurje          0.73 -0.92  0.05

# Observed counts per region and cluster.
round(chi_square$observed, 2)

##                    as.factor(mydata$ClusterWard)
## mydata$RegionF       1  2  3
##   Osrednjeslovenska 19 10 13
##   Stajerska         15 13 12
##   Gorenjska         12  6  9
##   Notranjska         5  5  2
##   Koroska            2  0  0
##   Primorska          2  2  7
##   Dolenjska          1  5  2
##   Prekmurje          2  0  1

# Share of the grand total per cell. Margins are added BEFORE rounding so the
# "Sum" entries are rounded true totals instead of sums of rounded cells,
# which need not add up to 1.
round(addmargins(prop.table(chi_square$observed)), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$RegionF          1     2     3   Sum
##   Osrednjeslovenska 0.131 0.069 0.090 0.290
##   Stajerska         0.103 0.090 0.083 0.276
##   Gorenjska         0.083 0.041 0.062 0.186
##   Notranjska        0.034 0.034 0.014 0.083
##   Koroska           0.014 0.000 0.000 0.014
##   Primorska         0.014 0.014 0.048 0.076
##   Dolenjska         0.007 0.034 0.014 0.055
##   Prekmurje         0.014 0.000 0.007 0.021
##   Sum               0.400 0.283 0.317 1.000

# Row proportions: distribution over clusters within each region.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$RegionF          1     2     3   Sum
##   Osrednjeslovenska 0.452 0.238 0.310 1.000
##   Stajerska         0.375 0.325 0.300 1.000
##   Gorenjska         0.444 0.222 0.333 1.000
##   Notranjska        0.417 0.417 0.167 1.000
##   Koroska           1.000 0.000 0.000 1.000
##   Primorska         0.182 0.182 0.636 1.000
##   Dolenjska         0.125 0.625 0.250 1.000
##   Prekmurje         0.667 0.000 0.333 1.000

# Column proportions: distribution over regions within each cluster.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                    as.factor(mydata$ClusterWard)
## mydata$RegionF          1     2     3
##   Osrednjeslovenska 0.328 0.244 0.283
##   Stajerska         0.259 0.317 0.261
##   Gorenjska         0.207 0.146 0.196
##   Notranjska        0.086 0.122 0.043
##   Koroska           0.034 0.000 0.000
##   Primorska         0.034 0.049 0.152
##   Dolenjska         0.017 0.122 0.043
##   Prekmurje         0.034 0.000 0.022
##   Sum               1.000 1.000 1.000

Live

# Pearson chi-squared test of independence: place of residence vs. cluster.
chi_square <- chisq.test(mydata$LiveF, as.factor(mydata$ClusterWard))
chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$LiveF and as.factor(mydata$ClusterWard)
## X-squared = 6.5434, df = 4, p-value = 0.1621

# Pearson residuals, (observed - expected) / sqrt(expected). The component is
# named in full because `$res` only resolves through partial name matching.
round(chi_square$residuals, 2)

##                  as.factor(mydata$ClusterWard)
## mydata$LiveF          1     2     3
##   Town             0.41  1.03 -1.43
##   Suburb          -0.18 -0.77  0.92
##   The countryside -0.38 -0.69  1.08

# Observed counts per place of residence and cluster.
round(chi_square$observed, 2)

##                  as.factor(mydata$ClusterWard)
## mydata$LiveF       1  2  3
##   Town            31 25 16
##   Suburb          11  6 12
##   The countryside 16 10 18

# Share of the grand total per cell. Margins are added BEFORE rounding so the
# "Sum" entries are rounded true totals instead of sums of rounded cells,
# which need not add up to 1.
round(addmargins(prop.table(chi_square$observed)), 3)

##                  as.factor(mydata$ClusterWard)
## mydata$LiveF          1     2     3   Sum
##   Town            0.214 0.172 0.110 0.497
##   Suburb          0.076 0.041 0.083 0.200
##   The countryside 0.110 0.069 0.124 0.303
##   Sum             0.400 0.283 0.317 1.000

# Row proportions: distribution over clusters within each place of residence.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                  as.factor(mydata$ClusterWard)
## mydata$LiveF          1     2     3   Sum
##   Town            0.431 0.347 0.222 1.000
##   Suburb          0.379 0.207 0.414 1.000
##   The countryside 0.364 0.227 0.409 1.000

# Column proportions: distribution over places of residence within each cluster.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                  as.factor(mydata$ClusterWard)
## mydata$LiveF          1     2     3
##   Town            0.534 0.610 0.348
##   Suburb          0.190 0.146 0.261
##   The countryside 0.276 0.244 0.391
##   Sum             1.000 1.000 1.000

Bank

# Pearson chi-squared test of independence: respondent's bank vs. cluster.
# NOTE(review): the warning echoed below is raised by chisq.test() when
# expected cell counts are small (below 5); many banks have only one to four
# respondents, so the p-value is only approximate — pooling rare banks into
# "Other" or using simulate.p.value = TRUE would be more defensible.
chi_square <- chisq.test(mydata$BankF, as.factor(mydata$ClusterWard))

## Warning in chisq.test(mydata$BankF, as.factor(mydata$ClusterWard)): Chi-squared
## approximation may be incorrect

chi_square

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$BankF and as.factor(mydata$ClusterWard)
## X-squared = 20.67, df = 22, p-value = 0.5412

# Pearson residuals, (observed - expected) / sqrt(expected). The component is
# named in full because `$res` only resolves through partial name matching.
round(chi_square$residuals, 2)

##                  as.factor(mydata$ClusterWard)
## mydata$BankF          1     2     3
##   NLB              0.81 -0.98  0.01
##   NKBM            -0.84  0.12  0.83
##   SKB             -0.67  1.07 -0.26
##   Revolut         -0.89  1.91 -0.80
##   N26             -0.18  0.16  0.05
##   Unicredit       -0.48  0.73 -0.15
##   Addiko          -0.63 -0.53  1.21
##   Gorenjska banka  1.44 -0.48 -1.16
##   Sparkasse       -0.63 -0.53  1.21
##   Delezna banka   -0.47  0.82 -0.24
##   Intesa Saopolo   0.39  0.23 -0.65
##   Other            0.29 -0.63  0.27

# Observed counts per bank and cluster.
round(chi_square$observed, 2)

##                  as.factor(mydata$ClusterWard)
## mydata$BankF       1  2  3
##   NLB             21  9 14
##   NKBM            13 12 16
##   SKB              3  5  3
##   Revolut          0  2  0
##   N26              1  1  1
##   Unicredit        2  3  2
##   Addiko           0  0  1
##   Gorenjska banka  9  3  2
##   Sparkasse        0  0  1
##   Delezna banka    1  2  1
##   Intesa Saopolo   3  2  1
##   Other            5  2  4

# Share of the grand total per cell. Margins are added BEFORE rounding so the
# "Sum" entries are rounded true totals instead of sums of rounded cells,
# which need not add up to 1.
round(addmargins(prop.table(chi_square$observed)), 3)

##                  as.factor(mydata$ClusterWard)
## mydata$BankF          1     2     3   Sum
##   NLB             0.145 0.062 0.097 0.303
##   NKBM            0.090 0.083 0.110 0.283
##   SKB             0.021 0.034 0.021 0.076
##   Revolut         0.000 0.014 0.000 0.014
##   N26             0.007 0.007 0.007 0.021
##   Unicredit       0.014 0.021 0.014 0.048
##   Addiko          0.000 0.000 0.007 0.007
##   Gorenjska banka 0.062 0.021 0.014 0.097
##   Sparkasse       0.000 0.000 0.007 0.007
##   Delezna banka   0.007 0.014 0.007 0.028
##   Intesa Saopolo  0.021 0.014 0.007 0.041
##   Other           0.034 0.014 0.028 0.076
##   Sum             0.400 0.283 0.317 1.000

# Row proportions: distribution over clusters within each bank.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)

##                  as.factor(mydata$ClusterWard)
## mydata$BankF          1     2     3   Sum
##   NLB             0.477 0.205 0.318 1.000
##   NKBM            0.317 0.293 0.390 1.000
##   SKB             0.273 0.455 0.273 1.000
##   Revolut         0.000 1.000 0.000 1.000
##   N26             0.333 0.333 0.333 1.000
##   Unicredit       0.286 0.429 0.286 1.000
##   Addiko          0.000 0.000 1.000 1.000
##   Gorenjska banka 0.643 0.214 0.143 1.000
##   Sparkasse       0.000 0.000 1.000 1.000
##   Delezna banka   0.250 0.500 0.250 1.000
##   Intesa Saopolo  0.500 0.333 0.167 1.000
##   Other           0.455 0.182 0.364 1.000

# Column proportions: distribution over banks within each cluster.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)

##                  as.factor(mydata$ClusterWard)
## mydata$BankF          1     2     3
##   NLB             0.362 0.220 0.304
##   NKBM            0.224 0.293 0.348
##   SKB             0.052 0.122 0.065
##   Revolut         0.000 0.049 0.000
##   N26             0.017 0.024 0.022
##   Unicredit       0.034 0.073 0.043
##   Addiko          0.000 0.000 0.022
##   Gorenjska banka 0.155 0.073 0.043
##   Sparkasse       0.000 0.000 0.022
##   Delezna banka   0.017 0.049 0.022
##   Intesa Saopolo  0.052 0.049 0.022
##   Other           0.086 0.049 0.087
##   Sum             1.000 1.000 1.000

Analysis of competition

# Load the perception-map survey export (semicolon-separated, "." decimals).
# NOTE(review): unlike the main survey file, which is read from a relative
# path, this one points into a home directory, so the script only runs as-is
# on a machine that has that folder.
mydata2 <- read.table(
  file = "~/IMB/Mutivariat analysis/Perception map data.csv",
  header = TRUE,
  sep = ";",
  dec = "."
)

# Drop the first data row: it carries the question wording, not answers.
mydata2 <- mydata2[-1, ]


# Convert the 18 rating columns (Q17a-f, Q18a-f, Q19a-f) to numeric in one
# vectorised step instead of eighteen copy-pasted assignments.
rating_cols <- paste0("Q", rep(17:19, each = 6), letters[1:6])
mydata2[rating_cols] <- lapply(mydata2[rating_cols], as.numeric)

# Rename the rating columns to "<bank>_<dimension>": Q17 = Mobile App,
# Q18 = Customer experience, Q19 = Reputation; suffixes a-f map to the six
# banks in the order listed. The names are generated from the two lookups
# rather than written out eighteen times.
banks <- c("NLB", "NKBM", "SKB", "Unicredit", "Revolut", "N26")
dimensions <- c("Mobile App", "Customer experience", "Reputation")
old_names <- paste0("Q", rep(17:19, each = 6), letters[1:6])
new_names <- paste(rep(banks, times = 3), rep(dimensions, each = 6), sep = "_")

# Columns missing from the data are skipped silently, as a no-match
# comparison would be.
col_pos <- match(old_names, colnames(mydata2))
found <- !is.na(col_pos)
colnames(mydata2)[col_pos[found]] <- new_names[found]



head(mydata2[1:6])

##   NLB_Mobile App NKBM_Mobile App SKB_Mobile App Unicredit_Mobile App
## 2              8               8              8                    8
## 3              6              -1             -1                   -1
## 4              5               8              8                    8
## 5              6               8              8                    8
## 6              2               8              6                    8
## 7              8               8              6                    8
##   Revolut_Mobile App N26_Mobile App
## 2                  8              8
## 3                 -1             -1
## 4                  6              8
## 5                  6              8
## 6                  7              6
## 7                  8              8

library(dplyr)
library(naniar)

# Recode the negative placeholder codes (-99 to -1) as NA in every column.
# NOTE(review): the raw preview shows many scores of 8 and the per-bank means
# printed further down exceed 7 — if 8 is a "don't know" code on a 1-7 scale
# it should be set to NA here as well; confirm against the questionnaire.
mydata2 <- mydata2 %>%
  replace_with_na_all(condition = ~ .x %in% -99:-1)

# Mean imputation: fill each column's NAs with that column's mean.
# across() replaces the superseded mutate_all().
mydata2 <- mydata2 %>%
  mutate(across(everything(), ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)))

head(mydata2[1:6])

## # A tibble: 6 × 6
##   `NLB_Mobile App` `NKBM_Mobile App` `SKB_Mobile App` `Unicredit_Mobile App`
##              <dbl>             <dbl>            <dbl>                  <dbl>
## 1                8              8                8                      8   
## 2                6              6.86             6.96                   7.24
## 3                5              8                8                      8   
## 4                6              8                8                      8   
## 5                2              8                6                      8   
## 6                8              8                6                      8   
## # ℹ 2 more variables: `Revolut_Mobile App` <dbl>, `N26_Mobile App` <dbl>

library(tibble)
# Build the bank x dimension table of mean scores used for the PCA.
# Step 1: stack all "<bank>_<dimension>" columns into (name, score) pairs.
scores_long <- pivot_longer(
  mydata2,
  cols = everything(),
  names_to = "name",
  values_to = "score"
)

# Step 2: split the column name at "_" into bank ("retailer") and dimension.
scores_long <- separate(
  scores_long,
  name,
  into = c("retailer", "dimension"),
  sep = "_"
)

# Step 3: one row per bank, one column per dimension, averaging all answers.
scores_wide <- pivot_wider(
  scores_long,
  names_from = dimension,
  values_from = score,
  values_fn = mean
)

# Step 4: move the bank label into the row names.
mydata_PCA <- column_to_rownames(scores_wide, var = "retailer")

print(mydata_PCA)

##           Mobile App Customer experience Reputation
## NLB         6.540541            6.718121   6.067114
## NKBM        6.863946            6.926174   6.187919
## SKB         6.958621            7.034014   6.040816
## Unicredit   7.243056            7.340136   6.287671
## Revolut     7.000000            7.020270   6.391892
## N26         7.137931            7.289655   6.734694

library(FactoMineR)

## Warning: package 'FactoMineR' was built under R version 4.3.2

# PCA on the standardised bank-level mean scores, keeping two components and
# suppressing FactoMineR's automatic plots.
pca <- PCA(
  X = mydata_PCA,
  scale.unit = TRUE,
  ncp = 2,
  graph = FALSE
)

# Correlations between the original variables and the two components.
print(pca$var$cor)

##                         Dim.1      Dim.2
## Mobile App          0.9514762 -0.2875578
## Customer experience 0.9703558 -0.2115676
## Reputation          0.8021391  0.5970290

library(factoextra)
# Biplot of banks and variables; repel = TRUE keeps labels from overlapping.
fviz_pca_biplot(pca, repel = TRUE)

NLB project analysis Group1

2024-01-24

Euclidean distance

Using Spearman correlation coefficient

Analysis of competition