#the data were partially cleaned in 1KA and Excel
# Load the survey export (semicolon-separated, decimal comma).
mydata <- read.table("./analiza-anketa.csv", header = TRUE, sep = ";", dec = ",")
# The first data row holds the question wording rather than a response: drop it.
mydata <- mydata[-1, ]
# Sequential respondent ID; seq_len() stays correct even when no rows remain.
mydata$ID <- seq_len(nrow(mydata))
head(mydata)
## Q1a Q1b Q1c Q1d Q1e Q1f Q1g Q1h Q1i Q1j Q1k Q1l Q1l_text Q2 Q2_12_text Q3 Q4
## 2 0 0 0 0 0 0 0 1 0 0 0 0 -2 8 -2 1 5
## 3 1 0 0 0 0 0 0 0 0 0 0 0 -2 1 -2 3 2
## 4 1 0 0 1 0 0 0 0 0 0 0 0 -2 1 -2 1 2
## 5 1 0 0 0 0 0 0 0 0 0 0 0 -2 1 -2 2 4
## 6 0 0 0 1 0 0 0 0 0 0 1 0 -2 11 -2 2 2
## 7 0 0 1 0 0 0 0 0 0 0 0 0 -2 3 -2 1 2
## Q5 Q6a Q6b Q6c Q6d Q7a Q7b Q7c Q7d Q7e Q7f Q7g Q7h Q7i Q7j Q7k Q7l Q8a Q8b
## 2 3 4 5 6 4 5 5 6 5 6 5 4 5 4 5 6 4 1 0
## 3 4 5 6 3 6 6 7 5 5 7 6 3 2 5 6 3 5 0 1
## 4 3 3 5 4 5 5 6 6 4 5 5 5 5 4 5 5 6 1 1
## 5 5 6 4 5 4 4 2 6 6 7 6 4 3 5 6 4 6 1 0
## 6 3 4 6 6 4 5 5 6 6 7 5 6 6 6 7 6 6 0 0
## 7 5 2 5 5 3 6 6 6 6 7 5 6 5 5 6 5 5 0 0
## Q8c Q8d Q8e Q8f Q8g Q8h Q8i Q8j Q8k Q8k_text Q9a Q9b Q9c Q9d Q9e Q9f Q9g Q9h
## 2 1 0 0 0 0 0 0 1 0 -2 0 1 0 0 0 1 0 0
## 3 0 0 0 1 1 0 0 0 0 -2 0 0 1 0 1 0 0 0
## 4 0 0 0 1 0 0 0 0 0 -2 0 0 1 1 0 0 0 0
## 5 1 0 0 1 0 0 0 0 0 -2 0 0 0 0 1 0 1 0
## 6 0 0 0 1 0 0 1 1 0 -2 0 1 0 1 0 0 1 0
## 7 1 0 0 1 0 1 0 0 0 -2 0 0 0 0 1 0 1 0
## Q9i Q9j Q9k Q9k_text Q10a Q10b Q10c Q10d Q10e Q10f Q11a Q11b Q11c Q11d Q11e
## 2 0 1 0 -2 5 5 5 4 5 4 1 1 1 0 0
## 3 0 1 0 -2 2 3 1 6 5 4 0 1 1 0 1
## 4 0 1 0 -2 2 2 2 2 3 2 0 1 1 1 0
## 5 0 1 0 -2 6 7 2 2 6 2 1 1 0 1 0
## 6 0 0 0 -2 2 2 5 2 3 2 0 1 1 0 0
## 7 0 1 0 -2 5 5 5 5 5 3 0 0 1 1 1
## Q11f Q11f_text Q13a Q13b Q13c Q13d Q13e Q13f Q13g Q13h Q13i Q14a Q14b
## 2 0 -2 5 6 4 6 6 6 5 5 6 0 0
## 3 0 -2 6 2 3 2 7 7 3 7 6 0 1
## 4 0 -2 5 5 4 4 5 6 5 6 6 0 1
## 5 0 -2 6 2 7 6 7 7 2 7 7 0 1
## 6 1 Letalske karte 5 6 2 4 6 6 5 6 6 0 0
## 7 0 -2 5 3 5 6 7 6 4 6 6 0 0
## Q14c Q14d Q14e Q14f Q14g Q14h Q14i Q14j Q14j_text Q15a Q15b Q15c Q15d Q15e
## 2 0 1 0 1 0 1 0 0 -2 0 0 1 0 1
## 3 0 1 1 0 0 0 0 0 -2 1 1 0 0 1
## 4 1 0 1 0 0 0 0 0 -2 1 1 0 0 1
## 5 1 1 0 0 0 0 0 0 -2 0 0 0 1 1
## 6 1 0 1 0 1 0 0 0 -2 1 1 0 0 1
## 7 0 1 1 0 0 1 0 0 -2 1 0 0 0 0
## Q15f Q15g Q15g_text Q16a Q16b Q16c Q16d Q16e Q16f Q16g Q16h Q16i Q17a Q17b
## 2 1 0 -2 5 5 5 5 5 5 5 5 6 8 8
## 3 0 0 -2 3 2 6 5 6 4 1 2 6 6 -1
## 4 0 0 -2 5 5 6 5 5 5 4 5 5 5 8
## 5 0 1 Apple Pay 4 4 6 6 6 6 4 4 6 6 8
## 6 0 0 -2 4 4 5 6 6 4 4 5 3 2 8
## 7 0 0 -2 4 4 6 6 6 5 4 4 6 8 8
## Q17c Q17d Q17e Q17f Q18a Q18b Q18c Q18d Q18e Q18f Q19a Q19b Q19c Q19d Q19e
## 2 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## 3 -1 -1 -1 -1 5 -1 -1 -1 -1 -1 5 -1 -1 -1 -1
## 4 8 8 6 8 5 8 8 8 6 8 3 3 4 4 3
## 5 8 8 6 8 5 8 8 8 6 8 6 8 8 6 6
## 6 6 8 7 6 3 8 5 8 6 6 4 5 6 5 6
## 7 6 8 8 8 8 8 6 8 8 8 8 8 5 8 8
## Q19f Q20 Q21 Q22 Q22_6_text Q23 Q24 Q24_6_text Q25 Q26 Q26_4_text Q27 Q28 ID
## 2 8 23 2 1 -2 4 1 -2 1 1 -2 3 3 1
## 3 -1 20 1 5 -2 3 5 -2 1 1 -2 2 1 2
## 4 3 26 1 2 -2 5 2 -2 5 2 -2 7 3 3
## 5 8 25 2 2 -2 5 1 -2 3 2 -2 1 1 4
## 6 6 24 1 5 -2 5 1 -2 3 2 -2 4 1 5
## 7 8 25 2 4 -2 5 2 -2 4 2 -2 1 2 6
# Recode the numeric answer codes into labelled factors so that R treats them
# as categorical (non-numerical) variables.
mydata$GenderF <- factor(mydata$Q21,
                         levels = c(2, 1, 4),
                         labels = c("F", "M", "Prefer not to say"))
# Residence, collapsed to "With parents" vs "Away from parents".
# The earlier detailed recode of Q22 was overwritten immediately and has been
# dropped. The collapsed labels previously contained "Away from parents " with
# a trailing space (which created a second, separate level) and an empty label
# for code 5; both are fixed here.
# NOTE(review): code 5 (student dorms) is kept as its own level; if dorm
# residents should count as "Away from parents", change its label accordingly.
mydata$ResidenceF <- factor(mydata$Q22,
                            levels = c(1, 2, 3, 4, 5, 6),
                            labels = c("With parents", "Away from parents", "Away from parents", "Away from parents", "Student dorms", "Away from parents"))
mydata$EducationF <- factor(mydata$Q23,
                            levels = c(4, 1, 2, 3, 5, 6),
                            labels = c("Bachelor degree", "Primary school", "Secondary school ", "Gymnasium", "Masters degree", "Phd"))
mydata$StatusF <- factor(mydata$Q24,
                         levels = c(1, 2, 3, 4, 5, 6),
                         labels = c("Student, with a student job", "Employed", "Unemployed ", "Self-employed", "Student, without a student job", "Other"))
mydata$Personal_Monthly_IncomeF <- factor(mydata$Q25,
                                          levels = c(1, 2, 3, 4, 5),
                                          labels = c("0-500", "501-1000", "1001-1500", "1501-2000", "More than 2000"))
mydata$Marital_StatusF <- factor(mydata$Q26,
                                 levels = c(1, 2, 3, 4),
                                 labels = c("In a relationship", "Single", "Married", "Other"))
mydata$RegionF <- factor(mydata$Q27,
                         levels = c(1, 2, 3, 4, 5, 6, 7, 8),
                         labels = c("Osrednjeslovenska", "Stajerska", "Gorenjska ", "Notranjska", "Koroska", "Primorska", "Dolenjska", "Prekmurje"))
mydata$LiveF <- factor(mydata$Q28,
                       levels = c(1, 2, 3),
                       labels = c("Town", "Suburb", "The countryside"))
mydata$BankF <- factor(mydata$Q2,
                       levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
                       labels = c("NLB", "NKBM", "SKB ", "Revolut", "N26", "Unicredit", "Addiko", "Gorenjska banka", "Sparkasse", "Delezna banka", "Intesa Saopolo", "Other"))
Clustering
# Q13e - simple and intuitive app
# Q13f - security
# Q7j  - transparency of information about terms and costs
# Q7l  - reputation of the bank
# Q10a - parents (important role) when opening a new bank account
# Q7a  - terms for a long-term loan (listed here, but not used in the clustering below)
# Convert the clustering variables to numeric BEFORE filtering: the columns
# are compared with `> 0` below, and comparing text values is a
# locale-dependent string comparison rather than a numeric one.
mydata$Q13e <- as.numeric(mydata$Q13e)
mydata$Q13f <- as.numeric(mydata$Q13f)
mydata$Q7j <- as.numeric(mydata$Q7j)
mydata$Q7l <- as.numeric(mydata$Q7l)
mydata$Q10a <- as.numeric(mydata$Q10a)
# Keep only respondents with a positive answer on every clustering variable
# (non-positive codes are excluded). which() also drops rows where the
# comparison is NA instead of turning them into all-NA rows.
mydata <- mydata[which(mydata$Q13e > 0 &
                         mydata$Q13f > 0 &
                         mydata$Q7j > 0 &
                         mydata$Q7l > 0 &
                         mydata$Q10a > 0), ]
summary(mydata[c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")]) # Describing clustering variables
## Q13e Q13f Q7j Q7l Q10a
## Min. :1.000 Min. :3.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:6.000 1st Qu.:6.00 1st Qu.:5.000 1st Qu.:5.000 1st Qu.:4.000
## Median :7.000 Median :7.00 Median :6.000 Median :6.000 Median :5.000
## Mean :6.387 Mean :6.38 Mean :5.813 Mean :5.487 Mean :4.853
## 3rd Qu.:7.000 3rd Qu.:7.00 3rd Qu.:7.000 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.00 Max. :7.000 Max. :7.000 Max. :7.000
# Standardize the clustering variables and keep them in a separate data frame.
mydata_clu_std <- as.data.frame(scale(mydata[, c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")]))
# Outlier screening: square root of the summed squared standardized values,
# accumulated column by column in the same left-to-right order as the columns.
mydata$Dissimilarity <- sqrt(Reduce(`+`, lapply(mydata_clu_std, function(z) z^2)))
# Respondents with the highest dissimilarity first.
head(mydata[order(-mydata$Dissimilarity), c("ID", "Dissimilarity")])
## ID Dissimilarity
## 38 37 7.599202
## 97 96 5.062270
## 124 123 4.978508
## 102 101 4.441051
## 34 33 4.399388
## 35 34 4.381617
# Show respondent ID 37. Select by the ID column rather than by row position:
# rows have been filtered above, so position 37 is not guaranteed to be ID 37.
print(mydata[mydata$ID == 37, ])
## Q1a Q1b Q1c Q1d Q1e Q1f Q1g Q1h Q1i Q1j Q1k Q1l Q1l_text Q2 Q2_12_text Q3 Q4
## 38 1 0 0 1 0 0 0 0 0 0 0 0 -2 1 -2 2 1
## Q5 Q6a Q6b Q6c Q6d Q7a Q7b Q7c Q7d Q7e Q7f Q7g Q7h Q7i Q7j Q7k Q7l Q8a Q8b
## 38 5 6 6 1 5 7 7 1 2 1 1 1 1 1 1 1 1 1 1
## Q8c Q8d Q8e Q8f Q8g Q8h Q8i Q8j Q8k Q8k_text Q9a Q9b Q9c Q9d Q9e Q9f Q9g Q9h
## 38 0 0 1 0 0 0 0 0 0 -2 0 0 1 0 0 0 1 0
## Q9i Q9j Q9k Q9k_text Q10a Q10b Q10c Q10d Q10e Q10f Q11a Q11b Q11c Q11d Q11e
## 38 0 1 0 -2 7 5 3 1 1 1 1 1 0 1 0
## Q11f Q11f_text Q13a Q13b Q13c Q13d Q13e Q13f Q13g Q13h Q13i Q14a Q14b Q14c
## 38 0 -2 6 1 7 1 1 7 5 6 7 0 1 0
## Q14d Q14e Q14f Q14g Q14h Q14i Q14j Q14j_text Q15a Q15b Q15c Q15d Q15e Q15f
## 38 1 1 0 0 0 0 0 -2 1 1 0 0 1 0
## Q15g Q15g_text Q16a Q16b Q16c Q16d Q16e Q16f Q16g Q16h Q16i Q17a Q17b Q17c
## 38 0 -2 4 4 7 7 5 4 2 4 7 7 8 5
## Q17d Q17e Q17f Q18a Q18b Q18c Q18d Q18e Q18f Q19a Q19b Q19c Q19d Q19e Q19f
## 38 8 7 8 5 8 8 8 7 8 7 8 6 8 5 8
## Q20 Q21 Q22 Q22_6_text Q23 Q24 Q24_6_text Q25 Q26 Q26_4_text Q27 Q28 ID
## 38 -1 2 1 -2 4 5 -2 1 2 -2 1 1 37
## GenderF ResidenceF EducationF StatusF
## 38 F With parents Bachelor degree Student, without a student job
## Personal_Monthly_IncomeF Marital_StatusF RegionF LiveF BankF
## 38 0-500 Single Osrednjeslovenska Town NLB
## Dissimilarity
## 38 7.599202
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Drop respondent ID 37 from the original data frame, then re-standardize the
# clustering variables on the remaining respondents.
mydata <- filter(mydata, !(ID %in% 37))
mydata_clu_std <- as.data.frame(scale(mydata[, c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")]))
#install.packages("factoextra")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Euclidean distances between respondents on the 5 standardized cluster
# variables, saved in Distances. The method name uses the same spelling as the
# Ward clustering step ("euclidean").
Distances <- get_dist(mydata_clu_std,
                      method = "euclidean")
# Squared distances, used for the heat map below.
Distances2 <- Distances^2
fviz_dist(Distances2, # Showing matrix of distances
          gradient = list(low = "darkred",
                          mid = "grey95",
                          high = "white"))
library(factoextra)
# Hopkins statistic for the standardized clustering variables, computed with
# n = (number of rows - 1) and without drawing the plot.
get_clust_tendency(mydata_clu_std, n = nrow(mydata_clu_std) - 1, graph = FALSE)
## $hopkins_stat
## [1] 0.7159374
##
## $plot
## NULL
library(dplyr)
# Hierarchical clustering of the standardized variables. Kept as one pipe
# because the printed "Call:" echoes the expression exactly as written.
WARD <- mydata_clu_std %>% #Selecting variables
get_dist(method = "euclidean") %>% #Selecting distance
hclust(method = "ward.D2") #Selecting algorithm
# Print a short description of the clustering object.
WARD
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 149
library(factoextra)
fviz_dend(WARD) # Dendrogram of the Ward clustering stored in WARD
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Cut the dendrogram into 4 groups and store the membership per respondent.
mydata$ClusterWard <- cutree(WARD, k = 4)
head(mydata[, c("ID", "ClusterWard")])
## ID ClusterWard
## 1 1 1
## 2 2 2
## 3 3 2
## 4 4 3
## 5 5 2
## 6 6 3
# Positions of the initial leaders: mean of each standardized clustering
# variable within each Ward cluster. The unnamed list in `by` is what yields
# the "Group.1" column in the printed result.
Leaders_initial <- aggregate(mydata_clu_std,
by = list(mydata$ClusterWard),
FUN = mean)
Leaders_initial
## Group.1 Q13e Q13f Q7j Q7l Q10a
## 1 1 -1.1417980 -1.1771576 -0.3658404 -0.3762346 0.3916587
## 2 2 0.1765311 0.2048837 0.2418637 0.3145441 -1.7760976
## 3 3 0.1948608 0.1688476 0.1119981 0.4194214 0.4331117
## 4 4 0.5190073 0.6411094 -0.2764593 -1.6484563 -0.1160904
# Reload the survey export from scratch (semicolon-separated, decimal comma).
mydata <- read.table("./analiza-anketa.csv", header = TRUE, sep = ";", dec = ",")
# The first data row holds the question wording rather than a response: drop it.
mydata <- mydata[-1, ]
# Sequential respondent ID; seq_len() stays correct even when no rows remain.
mydata$ID <- seq_len(nrow(mydata))
head(mydata)
## Q1a Q1b Q1c Q1d Q1e Q1f Q1g Q1h Q1i Q1j Q1k Q1l Q1l_text Q2 Q2_12_text Q3 Q4
## 2 0 0 0 0 0 0 0 1 0 0 0 0 -2 8 -2 1 5
## 3 1 0 0 0 0 0 0 0 0 0 0 0 -2 1 -2 3 2
## 4 1 0 0 1 0 0 0 0 0 0 0 0 -2 1 -2 1 2
## 5 1 0 0 0 0 0 0 0 0 0 0 0 -2 1 -2 2 4
## 6 0 0 0 1 0 0 0 0 0 0 1 0 -2 11 -2 2 2
## 7 0 0 1 0 0 0 0 0 0 0 0 0 -2 3 -2 1 2
## Q5 Q6a Q6b Q6c Q6d Q7a Q7b Q7c Q7d Q7e Q7f Q7g Q7h Q7i Q7j Q7k Q7l Q8a Q8b
## 2 3 4 5 6 4 5 5 6 5 6 5 4 5 4 5 6 4 1 0
## 3 4 5 6 3 6 6 7 5 5 7 6 3 2 5 6 3 5 0 1
## 4 3 3 5 4 5 5 6 6 4 5 5 5 5 4 5 5 6 1 1
## 5 5 6 4 5 4 4 2 6 6 7 6 4 3 5 6 4 6 1 0
## 6 3 4 6 6 4 5 5 6 6 7 5 6 6 6 7 6 6 0 0
## 7 5 2 5 5 3 6 6 6 6 7 5 6 5 5 6 5 5 0 0
## Q8c Q8d Q8e Q8f Q8g Q8h Q8i Q8j Q8k Q8k_text Q9a Q9b Q9c Q9d Q9e Q9f Q9g Q9h
## 2 1 0 0 0 0 0 0 1 0 -2 0 1 0 0 0 1 0 0
## 3 0 0 0 1 1 0 0 0 0 -2 0 0 1 0 1 0 0 0
## 4 0 0 0 1 0 0 0 0 0 -2 0 0 1 1 0 0 0 0
## 5 1 0 0 1 0 0 0 0 0 -2 0 0 0 0 1 0 1 0
## 6 0 0 0 1 0 0 1 1 0 -2 0 1 0 1 0 0 1 0
## 7 1 0 0 1 0 1 0 0 0 -2 0 0 0 0 1 0 1 0
## Q9i Q9j Q9k Q9k_text Q10a Q10b Q10c Q10d Q10e Q10f Q11a Q11b Q11c Q11d Q11e
## 2 0 1 0 -2 5 5 5 4 5 4 1 1 1 0 0
## 3 0 1 0 -2 2 3 1 6 5 4 0 1 1 0 1
## 4 0 1 0 -2 2 2 2 2 3 2 0 1 1 1 0
## 5 0 1 0 -2 6 7 2 2 6 2 1 1 0 1 0
## 6 0 0 0 -2 2 2 5 2 3 2 0 1 1 0 0
## 7 0 1 0 -2 5 5 5 5 5 3 0 0 1 1 1
## Q11f Q11f_text Q13a Q13b Q13c Q13d Q13e Q13f Q13g Q13h Q13i Q14a Q14b
## 2 0 -2 5 6 4 6 6 6 5 5 6 0 0
## 3 0 -2 6 2 3 2 7 7 3 7 6 0 1
## 4 0 -2 5 5 4 4 5 6 5 6 6 0 1
## 5 0 -2 6 2 7 6 7 7 2 7 7 0 1
## 6 1 Letalske karte 5 6 2 4 6 6 5 6 6 0 0
## 7 0 -2 5 3 5 6 7 6 4 6 6 0 0
## Q14c Q14d Q14e Q14f Q14g Q14h Q14i Q14j Q14j_text Q15a Q15b Q15c Q15d Q15e
## 2 0 1 0 1 0 1 0 0 -2 0 0 1 0 1
## 3 0 1 1 0 0 0 0 0 -2 1 1 0 0 1
## 4 1 0 1 0 0 0 0 0 -2 1 1 0 0 1
## 5 1 1 0 0 0 0 0 0 -2 0 0 0 1 1
## 6 1 0 1 0 1 0 0 0 -2 1 1 0 0 1
## 7 0 1 1 0 0 1 0 0 -2 1 0 0 0 0
## Q15f Q15g Q15g_text Q16a Q16b Q16c Q16d Q16e Q16f Q16g Q16h Q16i Q17a Q17b
## 2 1 0 -2 5 5 5 5 5 5 5 5 6 8 8
## 3 0 0 -2 3 2 6 5 6 4 1 2 6 6 -1
## 4 0 0 -2 5 5 6 5 5 5 4 5 5 5 8
## 5 0 1 Apple Pay 4 4 6 6 6 6 4 4 6 6 8
## 6 0 0 -2 4 4 5 6 6 4 4 5 3 2 8
## 7 0 0 -2 4 4 6 6 6 5 4 4 6 8 8
## Q17c Q17d Q17e Q17f Q18a Q18b Q18c Q18d Q18e Q18f Q19a Q19b Q19c Q19d Q19e
## 2 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## 3 -1 -1 -1 -1 5 -1 -1 -1 -1 -1 5 -1 -1 -1 -1
## 4 8 8 6 8 5 8 8 8 6 8 3 3 4 4 3
## 5 8 8 6 8 5 8 8 8 6 8 6 8 8 6 6
## 6 6 8 7 6 3 8 5 8 6 6 4 5 6 5 6
## 7 6 8 8 8 8 8 6 8 8 8 8 8 5 8 8
## Q19f Q20 Q21 Q22 Q22_6_text Q23 Q24 Q24_6_text Q25 Q26 Q26_4_text Q27 Q28 ID
## 2 8 23 2 1 -2 4 1 -2 1 1 -2 3 3 1
## 3 -1 20 1 5 -2 3 5 -2 1 1 -2 2 1 2
## 4 3 26 1 2 -2 5 2 -2 5 2 -2 7 3 3
## 5 8 25 2 2 -2 5 1 -2 3 2 -2 1 1 4
## 6 6 24 1 5 -2 5 1 -2 3 2 -2 4 1 5
## 7 8 25 2 4 -2 5 2 -2 4 2 -2 1 2 6
# Convert the clustering variables to numeric BEFORE filtering: comparing
# text values with `> 0` is a locale-dependent string comparison rather than
# a numeric one.
mydata$Q13e <- as.numeric(mydata$Q13e)
mydata$Q13f <- as.numeric(mydata$Q13f)
mydata$Q7j <- as.numeric(mydata$Q7j)
mydata$Q7l <- as.numeric(mydata$Q7l)
mydata$Q10a <- as.numeric(mydata$Q10a)
# Keep only respondents with a positive answer on every clustering variable.
# which() also drops rows where the comparison is NA instead of turning them
# into all-NA rows.
mydata <- mydata[which(mydata$Q13e > 0 &
                         mydata$Q13f > 0 &
                         mydata$Q7j > 0 &
                         mydata$Q7l > 0 &
                         mydata$Q10a > 0), ]
library(dplyr)
# Drop respondent ID 37 from the original data frame, then standardize the
# clustering variables on the remaining respondents.
mydata <- filter(mydata, !(ID %in% 37))
mydata_clu_std <- as.data.frame(scale(mydata[, c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")]))
library(dplyr)
# Second clustering run: "spearman" distance with the "ward.D" algorithm.
# Kept as one pipe because the printed "Call:" echoes the expression as written.
# NOTE(review): Ward's method is usually described for Euclidean distances;
# confirm that pairing it with a Spearman-based dissimilarity is intended.
WARD <- mydata_clu_std %>% #Selecting variables
get_dist(method = "spearman") %>%#Selecting distance
hclust(method = "ward.D") #Selecting algorithm
# Print a short description of the clustering object.
WARD
##
## Call:
## hclust(d = ., method = "ward.D")
##
## Cluster method : ward.D
## Number of objects: 149
library(factoextra)
# Dendrogram of the second clustering run.
fviz_dend(WARD)
# Cut the tree into 3 groups and store the membership per respondent.
mydata$ClusterWard <- cutree(WARD, k = 3)
head(mydata[, c("ID", "ClusterWard")])
## ID ClusterWard
## 1 1 1
## 2 2 1
## 3 3 2
## 4 4 1
## 5 5 2
## 6 6 3
# Positions of the initial leaders: mean of each standardized clustering
# variable within each Ward cluster (per the original note, intended as the
# starting point for k-means clustering). The unnamed list in `by` is what
# yields the "Group.1" column in the printed result.
Leaders_initial <- aggregate(mydata_clu_std,
by = list(mydata$ClusterWard),
FUN = mean)
Leaders_initial
## Group.1 Q13e Q13f Q7j Q7l Q10a
## 1 1 0.2681796 0.58583604 -0.29924274 -0.537584391 -0.1710966
## 2 2 -0.0358253 0.04793232 0.45038760 0.782974895 -0.5789471
## 3 3 -0.3046238 -0.77323723 -0.01065265 0.003189433 0.7083880
# Use the Ward cluster means as the centroids and show them to 3 decimals.
Centroids <- Leaders_initial
round(Centroids, digits = 3)
## Group.1 Q13e Q13f Q7j Q7l Q10a
## 1 1 0.268 0.586 -0.299 -0.538 -0.171
## 2 2 -0.036 0.048 0.450 0.783 -0.579
## 3 3 -0.305 -0.773 -0.011 0.003 0.708
library(ggplot2)
library(tidyr)
# Reshape the centroids to long format (one row per cluster x variable) for
# the profile plot.
Picture <- as.data.frame(Centroids)
# Row index per cluster; seq_len() is safe for a zero-row frame.
Picture$id <- seq_len(nrow(Picture))
Picture <- pivot_longer(Picture, cols = c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a"))
# Group labels are derived from the ids actually present, so the plot keeps
# working if the number of clusters chosen in cutree() changes.
Picture$Group <- factor(Picture$id)
# Fix the x-axis order of the clustering variables.
Picture$nameFactor <- factor(Picture$name,
                             levels = c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a"))
# Showing the lines: one profile per cluster across the clustering variables.
ggplot(Picture, aes(x = nameFactor, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Group, col = Group), size = 3) +
  geom_line(aes(group = id, linetype = Group, col = Group), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables")
# Number of respondents per cluster.
table(mydata$ClusterWard)
##
## 1 2 3
## 60 41 48
# Checking whether the clustering variables differentiate between the groups:
# one ANOVA table per response variable, with cluster membership as the factor.
# The formula text is kept as-is because it appears in the printed tables.
fit <- aov(cbind(Q13e, Q13f, Q7j, Q7l, Q10a) ~ as.factor(ClusterWard),
data = mydata)
summary(fit)
## Response Q13e :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterWard) 2 7.294 3.6469 4.6272 0.01126 *
## Residuals 146 115.069 0.7881
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q13f :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterWard) 2 32.352 16.1759 36.558 1.345e-13 ***
## Residuals 146 64.601 0.4425
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q7j :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterWard) 2 18.271 9.1354 7.4438 0.0008349 ***
## Residuals 146 179.179 1.2273
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q7l :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterWard) 2 68.652 34.326 29.384 1.887e-11 ***
## Residuals 146 170.556 1.168
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Q10a :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterWard) 2 126.28 63.141 26.655 1.356e-10 ***
## Residuals 146 345.85 2.369
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Convert Q20 to numeric before filtering so that `> 0` is a numeric
# comparison, then keep only positive values. which() also drops NA results.
mydata$Q20 <- as.numeric(mydata$Q20)
mydata <- mydata[which(mydata$Q20 > 0), ]
# Mean of Q20 per cluster.
aggregate(mydata$Q20,
          by = list(mydata$ClusterWard),
          FUN = mean)
## Group.1 x
## 1 1 22.45763
## 2 2 22.73171
## 3 3 21.86957
# Convert Q21 to numeric before filtering so that `> 0` is a numeric
# comparison, then keep only positive values. which() also drops NA results.
mydata$Q21 <- as.numeric(mydata$Q21)
mydata <- mydata[which(mydata$Q21 > 0), ]
# Mean of the Q21 code per cluster.
# NOTE(review): Q21 is a category code (it is recoded into GenderF), so this
# mean is only a rough indicator; the cross-tabulation is the proper summary.
aggregate(mydata$Q21,
          by = list(mydata$ClusterWard),
          FUN = mean)
## Group.1 x
## 1 1 1.689655
## 2 2 1.512195
## 3 3 1.608696
Age
# One-way ANOVA of Q20 across the Ward clusters.
fit <- aov(Q20 ~ as.factor(ClusterWard), data = mydata)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterWard) 2 17.6 8.801 1.713 0.184
## Residuals 142 729.7 5.139
# Mean of Q20 per Ward cluster, recomputed on the current (filtered) data.
aggregate(mydata$Q20,
by = list(mydata$ClusterWard),
FUN = mean)
## Group.1 x
## 1 1 22.48276
## 2 2 22.73171
## 3 3 21.86957
# Turn the numeric answer codes into labelled factors so that R treats them as
# categorical (non-numerical) variables in the cross-tabulations below.
mydata$GenderF <- factor(mydata$Q21,
                         levels = c(2, 1, 4),
                         labels = c("F", "M", "Prefer not to say"))
mydata$ResidenceF <- factor(mydata$Q22,
                            levels = 1:6,
                            labels = c("With parents", "Studio apartment ", "Shared apartment", "Owned apartment", "Student dorms", "Other"))
mydata$EducationF <- factor(mydata$Q23,
                            levels = c(4, 1, 2, 3, 5, 6),
                            labels = c("Bachelor degree", "Primary school", "Secondary school ", "Gymnasium", "Masters degree", "Phd"))
mydata$StatusF <- factor(mydata$Q24,
                         levels = 1:6,
                         labels = c("Student, with a student job", "Employed", "Unemployed ", "Self-employed", "Student, without a student job", "Other"))
mydata$Personal_Monthly_IncomeF <- factor(mydata$Q25,
                                          levels = 1:5,
                                          labels = c("0-500", "501-1000", "1001-1500", "1501-2000", "More than 2000"))
mydata$Marital_StatusF <- factor(mydata$Q26,
                                 levels = 1:4,
                                 labels = c("In a relationship", "Single", "Married", "Other"))
mydata$RegionF <- factor(mydata$Q27,
                         levels = 1:8,
                         labels = c("Osrednjeslovenska", "Stajerska", "Gorenjska ", "Notranjska", "Koroska", "Primorska", "Dolenjska", "Prekmurje"))
mydata$LiveF <- factor(mydata$Q28,
                       levels = 1:3,
                       labels = c("Town", "Suburb", "The countryside"))
mydata$BankF <- factor(mydata$Q2,
                       levels = 1:12,
                       labels = c("NLB", "NKBM", "SKB ", "Revolut", "N26", "Unicredit", "Addiko", "Gorenjska banka", "Sparkasse", "Delezna banka", "Intesa Saopolo", "Other"))
Gender
chi_square <- chisq.test(mydata$GenderF, as.factor(mydata$ClusterWard))
## Warning in chisq.test(mydata$GenderF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$GenderF and as.factor(mydata$ClusterWard)
## X-squared = 8.1946, df = 4, p-value = 0.0847
# Residuals per cell; `residuals` is spelled out instead of relying on partial
# `$` matching of `res`.
round(chi_square$residuals, 2)
## as.factor(mydata$ClusterWard)
## mydata$GenderF 1 2 3
## F 0.95 -1.28 0.14
## M -1.08 1.38 -0.09
## Prefer not to say -0.63 1.35 -0.56
round (chi_square$observed, 2)
## as.factor(mydata$ClusterWard)
## mydata$GenderF 1 2 3
## F 40 18 28
## M 18 22 18
## Prefer not to say 0 1 0
addmargins(round(prop.table(chi_square$observed), 3))
## as.factor(mydata$ClusterWard)
## mydata$GenderF 1 2 3 Sum
## F 0.276 0.124 0.193 0.593
## M 0.124 0.152 0.124 0.400
## Prefer not to say 0.000 0.007 0.000 0.007
## Sum 0.400 0.283 0.317 1.000
addmargins(round(prop.table(chi_square$observed, 1), 3), 2)
## as.factor(mydata$ClusterWard)
## mydata$GenderF 1 2 3 Sum
## F 0.465 0.209 0.326 1.000
## M 0.310 0.379 0.310 0.999
## Prefer not to say 0.000 1.000 0.000 1.000
addmargins(round(prop.table(chi_square$observed, 2), 3), 1)
## as.factor(mydata$ClusterWard)
## mydata$GenderF 1 2 3
## F 0.690 0.439 0.609
## M 0.310 0.537 0.391
## Prefer not to say 0.000 0.024 0.000
## Sum 1.000 1.000 1.000
Residence
chi_square <- chisq.test(mydata$ResidenceF, as.factor(mydata$ClusterWard))
## Warning in chisq.test(mydata$ResidenceF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$ResidenceF and as.factor(mydata$ClusterWard)
## X-squared = 10.978, df = 10, p-value = 0.3593
# Residuals per cell; `residuals` is spelled out instead of relying on partial
# `$` matching of `res`.
round(chi_square$residuals, 2)
## as.factor(mydata$ClusterWard)
## mydata$ResidenceF 1 2 3
## With parents -0.48 -0.42 0.93
## Studio apartment -0.31 1.00 -0.60
## Shared apartment 0.00 1.41 -1.33
## Owned apartment -0.09 0.17 -0.06
## Student dorms 1.23 -1.45 -0.01
## Other 0.22 -0.75 0.46
round (chi_square$observed, 2)
## as.factor(mydata$ClusterWard)
## mydata$ResidenceF 1 2 3
## With parents 27 19 28
## Studio apartment 6 7 4
## Shared apartment 8 9 3
## Owned apartment 5 4 4
## Student dorms 11 2 6
## Other 1 0 1
addmargins(round(prop.table(chi_square$observed), 3))
## as.factor(mydata$ClusterWard)
## mydata$ResidenceF 1 2 3 Sum
## With parents 0.186 0.131 0.193 0.510
## Studio apartment 0.041 0.048 0.028 0.117
## Shared apartment 0.055 0.062 0.021 0.138
## Owned apartment 0.034 0.028 0.028 0.090
## Student dorms 0.076 0.014 0.041 0.131
## Other 0.007 0.000 0.007 0.014
## Sum 0.399 0.283 0.318 1.000
addmargins(round(prop.table(chi_square$observed, 1), 3), 2)
## as.factor(mydata$ClusterWard)
## mydata$ResidenceF 1 2 3 Sum
## With parents 0.365 0.257 0.378 1.000
## Studio apartment 0.353 0.412 0.235 1.000
## Shared apartment 0.400 0.450 0.150 1.000
## Owned apartment 0.385 0.308 0.308 1.001
## Student dorms 0.579 0.105 0.316 1.000
## Other 0.500 0.000 0.500 1.000
addmargins(round(prop.table(chi_square$observed, 2), 3), 1)
## as.factor(mydata$ClusterWard)
## mydata$ResidenceF 1 2 3
## With parents 0.466 0.463 0.609
## Studio apartment 0.103 0.171 0.087
## Shared apartment 0.138 0.220 0.065
## Owned apartment 0.086 0.098 0.087
## Student dorms 0.190 0.049 0.130
## Other 0.017 0.000 0.022
## Sum 1.000 1.001 1.000
Education
chi_square <- chisq.test(mydata$EducationF, as.factor(mydata$ClusterWard))
## Warning in chisq.test(mydata$EducationF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$EducationF and as.factor(mydata$ClusterWard)
## X-squared = 7.2304, df = 6, p-value = 0.3001
# Residuals per cell; `residuals` is spelled out instead of relying on partial
# `$` matching of `res`.
round(chi_square$residuals, 2)
## as.factor(mydata$ClusterWard)
## mydata$EducationF 1 2 3
## Bachelor degree 0.31 -0.38 0.01
## Secondary school -1.63 1.83 0.11
## Gymnasium 0.64 -0.70 -0.06
## Masters degree -0.16 0.22 -0.03
round (chi_square$observed, 2)
## as.factor(mydata$ClusterWard)
## mydata$EducationF 1 2 3
## Bachelor degree 28 17 21
## Secondary school 2 8 5
## Gymnasium 22 11 15
## Masters degree 6 5 5
addmargins(round(prop.table(chi_square$observed), 3))
## as.factor(mydata$ClusterWard)
## mydata$EducationF 1 2 3 Sum
## Bachelor degree 0.193 0.117 0.145 0.455
## Secondary school 0.014 0.055 0.034 0.103
## Gymnasium 0.152 0.076 0.103 0.331
## Masters degree 0.041 0.034 0.034 0.109
## Sum 0.400 0.282 0.316 0.998
addmargins(round(prop.table(chi_square$observed, 1), 3), 2)
## as.factor(mydata$ClusterWard)
## mydata$EducationF 1 2 3 Sum
## Bachelor degree 0.424 0.258 0.318 1.000
## Secondary school 0.133 0.533 0.333 0.999
## Gymnasium 0.458 0.229 0.312 0.999
## Masters degree 0.375 0.312 0.312 0.999
addmargins(round(prop.table(chi_square$observed, 2), 3), 1)
## as.factor(mydata$ClusterWard)
## mydata$EducationF 1 2 3
## Bachelor degree 0.483 0.415 0.457
## Secondary school 0.034 0.195 0.109
## Gymnasium 0.379 0.268 0.326
## Masters degree 0.103 0.122 0.109
## Sum 0.999 1.000 1.001
Status
chi_square <- chisq.test(mydata$StatusF, as.factor(mydata$ClusterWard))
## Warning in chisq.test(mydata$StatusF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$StatusF and as.factor(mydata$ClusterWard)
## X-squared = 9.421, df = 10, p-value = 0.4927
# Residuals per cell; `residuals` is spelled out instead of relying on partial
# `$` matching of `res`.
round(chi_square$residuals, 2)
## as.factor(mydata$ClusterWard)
## mydata$StatusF 1 2 3
## Student, with a student job 0.04 -0.08 0.03
## Employed -0.32 1.48 -1.04
## Unemployed 0.73 -0.92 0.05
## Self-employed 1.11 -0.12 -1.13
## Student, without a student job -0.31 -0.84 1.14
## Other -0.18 0.16 0.05
round (chi_square$observed, 2)
## as.factor(mydata$ClusterWard)
## mydata$StatusF 1 2 3
## Student, with a student job 29 20 23
## Employed 9 11 5
## Unemployed 2 0 1
## Self-employed 3 1 0
## Student, without a student job 14 8 16
## Other 1 1 1
addmargins(round(prop.table(chi_square$observed), 3))
## as.factor(mydata$ClusterWard)
## mydata$StatusF 1 2 3 Sum
## Student, with a student job 0.200 0.138 0.159 0.497
## Employed 0.062 0.076 0.034 0.172
## Unemployed 0.014 0.000 0.007 0.021
## Self-employed 0.021 0.007 0.000 0.028
## Student, without a student job 0.097 0.055 0.110 0.262
## Other 0.007 0.007 0.007 0.021
## Sum 0.401 0.283 0.317 1.001
addmargins(round(prop.table(chi_square$observed, 1), 3), 2)
## as.factor(mydata$ClusterWard)
## mydata$StatusF 1 2 3 Sum
## Student, with a student job 0.403 0.278 0.319 1.000
## Employed 0.360 0.440 0.200 1.000
## Unemployed 0.667 0.000 0.333 1.000
## Self-employed 0.750 0.250 0.000 1.000
## Student, without a student job 0.368 0.211 0.421 1.000
## Other 0.333 0.333 0.333 0.999
addmargins(round(prop.table(chi_square$observed, 2), 3), 1)
## as.factor(mydata$ClusterWard)
## mydata$StatusF 1 2 3
## Student, with a student job 0.500 0.488 0.500
## Employed 0.155 0.268 0.109
## Unemployed 0.034 0.000 0.022
## Self-employed 0.052 0.024 0.000
## Student, without a student job 0.241 0.195 0.348
## Other 0.017 0.024 0.022
## Sum 0.999 0.999 1.001
Personal monthly income
chi_square <- chisq.test(mydata$Personal_Monthly_IncomeF, as.factor(mydata$ClusterWard))
## Warning in chisq.test(mydata$Personal_Monthly_IncomeF,
## as.factor(mydata$ClusterWard)): Chi-squared approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Personal_Monthly_IncomeF and as.factor(mydata$ClusterWard)
## X-squared = 14.564, df = 8, p-value = 0.0682
# Residuals per cell; `residuals` is spelled out instead of relying on partial
# `$` matching of `res`.
round(chi_square$residuals, 2)
## as.factor(mydata$ClusterWard)
## 1 2 3
## 0-500 0.08 -1.70 1.52
## 501-1000 0.34 0.65 -0.99
## 1001-1500 -0.55 1.16 -0.48
## 1501-2000 0.67 -0.04 -0.72
## More than 2000 -1.00 1.89 -0.66
round (chi_square$observed, 2)
## as.factor(mydata$ClusterWard)
## 1 2 3
## 0-500 28 12 29
## 501-1000 14 11 7
## 1001-1500 5 7 4
## 1501-2000 9 5 4
## More than 2000 2 6 2
addmargins(round(prop.table(chi_square$observed), 3))
## as.factor(mydata$ClusterWard)
## 1 2 3 Sum
## 0-500 0.193 0.083 0.200 0.476
## 501-1000 0.097 0.076 0.048 0.221
## 1001-1500 0.034 0.048 0.028 0.110
## 1501-2000 0.062 0.034 0.028 0.124
## More than 2000 0.014 0.041 0.014 0.069
## Sum 0.400 0.282 0.318 1.000
addmargins(round(prop.table(chi_square$observed, 1), 3), 2)
## as.factor(mydata$ClusterWard)
## 1 2 3 Sum
## 0-500 0.406 0.174 0.420 1.000
## 501-1000 0.438 0.344 0.219 1.001
## 1001-1500 0.312 0.438 0.250 1.000
## 1501-2000 0.500 0.278 0.222 1.000
## More than 2000 0.200 0.600 0.200 1.000
addmargins(round(prop.table(chi_square$observed, 2), 3), 1)
## as.factor(mydata$ClusterWard)
## 1 2 3
## 0-500 0.483 0.293 0.630
## 501-1000 0.241 0.268 0.152
## 1001-1500 0.086 0.171 0.087
## 1501-2000 0.155 0.122 0.087
## More than 2000 0.034 0.146 0.043
## Sum 0.999 1.000 0.999
Status (repeated: same test and tables as the Status section above)
chi_square <- chisq.test(mydata$StatusF, as.factor(mydata$ClusterWard))
## Warning in chisq.test(mydata$StatusF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$StatusF and as.factor(mydata$ClusterWard)
## X-squared = 9.421, df = 10, p-value = 0.4927
# Residuals per cell; `residuals` is spelled out instead of relying on partial
# `$` matching of `res`.
round(chi_square$residuals, 2)
## as.factor(mydata$ClusterWard)
## mydata$StatusF 1 2 3
## Student, with a student job 0.04 -0.08 0.03
## Employed -0.32 1.48 -1.04
## Unemployed 0.73 -0.92 0.05
## Self-employed 1.11 -0.12 -1.13
## Student, without a student job -0.31 -0.84 1.14
## Other -0.18 0.16 0.05
round (chi_square$observed, 2)
## as.factor(mydata$ClusterWard)
## mydata$StatusF 1 2 3
## Student, with a student job 29 20 23
## Employed 9 11 5
## Unemployed 2 0 1
## Self-employed 3 1 0
## Student, without a student job 14 8 16
## Other 1 1 1
addmargins(round(prop.table(chi_square$observed), 3))
## as.factor(mydata$ClusterWard)
## mydata$StatusF 1 2 3 Sum
## Student, with a student job 0.200 0.138 0.159 0.497
## Employed 0.062 0.076 0.034 0.172
## Unemployed 0.014 0.000 0.007 0.021
## Self-employed 0.021 0.007 0.000 0.028
## Student, without a student job 0.097 0.055 0.110 0.262
## Other 0.007 0.007 0.007 0.021
## Sum 0.401 0.283 0.317 1.001
addmargins(round(prop.table(chi_square$observed, 1), 3), 2)
## as.factor(mydata$ClusterWard)
## mydata$StatusF 1 2 3 Sum
## Student, with a student job 0.403 0.278 0.319 1.000
## Employed 0.360 0.440 0.200 1.000
## Unemployed 0.667 0.000 0.333 1.000
## Self-employed 0.750 0.250 0.000 1.000
## Student, without a student job 0.368 0.211 0.421 1.000
## Other 0.333 0.333 0.333 0.999
addmargins(round(prop.table(chi_square$observed, 2), 3), 1)
## as.factor(mydata$ClusterWard)
## mydata$StatusF 1 2 3
## Student, with a student job 0.500 0.488 0.500
## Employed 0.155 0.268 0.109
## Unemployed 0.034 0.000 0.022
## Self-employed 0.052 0.024 0.000
## Student, without a student job 0.241 0.195 0.348
## Other 0.017 0.024 0.022
## Sum 0.999 0.999 1.001
Marital status
chi_square <- chisq.test(mydata$Marital_StatusF, as.factor(mydata$ClusterWard))
## Warning in chisq.test(mydata$Marital_StatusF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$Marital_StatusF and as.factor(mydata$ClusterWard)
## X-squared = 5.0417, df = 4, p-value = 0.283
# Residuals per cell; `residuals` is spelled out instead of relying on partial
# `$` matching of `res`.
round(chi_square$residuals, 2)
## as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF 1 2 3
## In a relationship 0.08 -1.02 0.88
## Single -0.18 1.04 -0.78
## Married 0.95 -0.53 -0.56
round (chi_square$observed, 2)
## as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF 1 2 3
## In a relationship 28 15 26
## Single 29 26 20
## Married 1 0 0
addmargins(round(prop.table(chi_square$observed), 3))
## as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF 1 2 3 Sum
## In a relationship 0.193 0.103 0.179 0.475
## Single 0.200 0.179 0.138 0.517
## Married 0.007 0.000 0.000 0.007
## Sum 0.400 0.282 0.317 0.999
addmargins(round(prop.table(chi_square$observed, 1), 3), 2)
## as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF 1 2 3 Sum
## In a relationship 0.406 0.217 0.377 1.000
## Single 0.387 0.347 0.267 1.001
## Married 1.000 0.000 0.000 1.000
addmargins(round(prop.table(chi_square$observed, 2), 3), 1)
## as.factor(mydata$ClusterWard)
## mydata$Marital_StatusF 1 2 3
## In a relationship 0.483 0.366 0.565
## Single 0.500 0.634 0.435
## Married 0.017 0.000 0.000
## Sum 1.000 1.000 1.000
Region
chi_square <- chisq.test(mydata$RegionF, as.factor(mydata$ClusterWard))
## Warning in chisq.test(mydata$RegionF, as.factor(mydata$ClusterWard)):
## Chi-squared approximation may be incorrect
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$RegionF and as.factor(mydata$ClusterWard)
## X-squared = 17.638, df = 14, p-value = 0.2238
# Residuals per cell; `residuals` is spelled out instead of relying on partial
# `$` matching of `res`.
round(chi_square$residuals, 2)
## as.factor(mydata$ClusterWard)
## mydata$RegionF 1 2 3
## Osrednjeslovenska 0.54 -0.54 -0.09
## Stajerska -0.25 0.50 -0.19
## Gorenjska 0.37 -0.59 0.15
## Notranjska 0.09 0.87 -0.93
## Koroska 1.34 -0.75 -0.80
## Primorska -1.14 -0.63 1.88
## Dolenjska -1.23 1.82 -0.34
## Prekmurje 0.73 -0.92 0.05
# Observed cell counts behind the test (region x cluster).
round(x = chi_square$observed, digits = 2)
## as.factor(mydata$ClusterWard)
## mydata$RegionF 1 2 3
## Osrednjeslovenska 19 10 13
## Stajerska 15 13 12
## Gorenjska 12 6 9
## Notranjska 5 5 2
## Koroska 2 0 0
## Primorska 2 2 7
## Dolenjska 1 5 2
## Prekmurje 2 0 1
# Cell shares of the whole sample with row and column totals. Margins are
# added before rounding so the totals are not distorted by summing
# already-rounded cells.
round(addmargins(prop.table(chi_square$observed)), 3)
## as.factor(mydata$ClusterWard)
## mydata$RegionF 1 2 3 Sum
## Osrednjeslovenska 0.131 0.069 0.090 0.290
## Stajerska 0.103 0.090 0.083 0.276
## Gorenjska 0.083 0.041 0.062 0.186
## Notranjska 0.034 0.034 0.014 0.083
## Koroska 0.014 0.000 0.000 0.014
## Primorska 0.014 0.014 0.048 0.076
## Dolenjska 0.007 0.034 0.014 0.055
## Prekmurje 0.014 0.000 0.007 0.021
## Sum 0.400 0.283 0.317 1.000
# Row percentages (cluster split within each region) with a row total.
# The total is added before rounding so it is not built from
# already-rounded cells.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)
## as.factor(mydata$ClusterWard)
## mydata$RegionF 1 2 3 Sum
## Osrednjeslovenska 0.452 0.238 0.310 1.000
## Stajerska 0.375 0.325 0.300 1.000
## Gorenjska 0.444 0.222 0.333 1.000
## Notranjska 0.417 0.417 0.167 1.000
## Koroska 1.000 0.000 0.000 1.000
## Primorska 0.182 0.182 0.636 1.000
## Dolenjska 0.125 0.625 0.250 1.000
## Prekmurje 0.667 0.000 0.333 1.000
# Column percentages (regional split within each cluster) with a column
# total. The total is added before rounding so it is not built from
# already-rounded cells.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)
## as.factor(mydata$ClusterWard)
## mydata$RegionF 1 2 3
## Osrednjeslovenska 0.328 0.244 0.283
## Stajerska 0.259 0.317 0.261
## Gorenjska 0.207 0.146 0.196
## Notranjska 0.086 0.122 0.043
## Koroska 0.034 0.000 0.000
## Primorska 0.034 0.049 0.152
## Dolenjska 0.017 0.122 0.043
## Prekmurje 0.034 0.000 0.022
## Sum 1.000 1.000 1.000
# Live ----
# Pearson chi-squared test of independence between place of residence
# (town / suburb / countryside) and Ward cluster.
chi_square <- chisq.test(mydata$LiveF, as.factor(mydata$ClusterWard))
# Show the test statistic, degrees of freedom and p-value.
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$LiveF and as.factor(mydata$ClusterWard)
## X-squared = 6.5434, df = 4, p-value = 0.1621
# Pearson residuals for each residence x cluster cell, to 2 decimals.
# `residuals` is written out in full: `$res` only worked through partial
# matching of list element names, which is fragile.
round(chi_square$residuals, 2)
## as.factor(mydata$ClusterWard)
## mydata$LiveF 1 2 3
## Town 0.41 1.03 -1.43
## Suburb -0.18 -0.77 0.92
## The countryside -0.38 -0.69 1.08
# Observed cell counts behind the test (residence x cluster).
round(x = chi_square$observed, digits = 2)
## as.factor(mydata$ClusterWard)
## mydata$LiveF 1 2 3
## Town 31 25 16
## Suburb 11 6 12
## The countryside 16 10 18
# Cell shares of the whole sample with row and column totals. Margins are
# added before rounding so the totals are not distorted by summing
# already-rounded cells.
round(addmargins(prop.table(chi_square$observed)), 3)
## as.factor(mydata$ClusterWard)
## mydata$LiveF 1 2 3 Sum
## Town 0.214 0.172 0.110 0.497
## Suburb 0.076 0.041 0.083 0.200
## The countryside 0.110 0.069 0.124 0.303
## Sum 0.400 0.283 0.317 1.000
# Row percentages (cluster split within each place of residence) with a
# row total. The total is added before rounding so it is not built from
# already-rounded cells.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)
## as.factor(mydata$ClusterWard)
## mydata$LiveF 1 2 3 Sum
## Town 0.431 0.347 0.222 1.000
## Suburb 0.379 0.207 0.414 1.000
## The countryside 0.364 0.227 0.409 1.000
# Column percentages (residence split within each cluster) with a column
# total. The total is added before rounding so it is not built from
# already-rounded cells.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)
## as.factor(mydata$ClusterWard)
## mydata$LiveF 1 2 3
## Town 0.534 0.610 0.348
## Suburb 0.190 0.146 0.261
## The countryside 0.276 0.244 0.391
## Sum 1.000 1.000 1.000
# Bank ----
# Pearson chi-squared test of independence between bank and Ward cluster.
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect; several banks have only one or two respondents, so treat the
# p-value with caution (a simulated p-value or Fisher's exact test may be
# worth checking).
chi_square <- chisq.test(mydata$BankF, as.factor(mydata$ClusterWard))
## Warning in chisq.test(mydata$BankF, as.factor(mydata$ClusterWard)): Chi-squared
## approximation may be incorrect
# Show the test statistic, degrees of freedom and p-value.
chi_square
##
## Pearson's Chi-squared test
##
## data: mydata$BankF and as.factor(mydata$ClusterWard)
## X-squared = 20.67, df = 22, p-value = 0.5412
# Pearson residuals for each bank x cluster cell, to 2 decimals.
# `residuals` is written out in full: `$res` only worked through partial
# matching of list element names, which is fragile.
round(chi_square$residuals, 2)
## as.factor(mydata$ClusterWard)
## mydata$BankF 1 2 3
## NLB 0.81 -0.98 0.01
## NKBM -0.84 0.12 0.83
## SKB -0.67 1.07 -0.26
## Revolut -0.89 1.91 -0.80
## N26 -0.18 0.16 0.05
## Unicredit -0.48 0.73 -0.15
## Addiko -0.63 -0.53 1.21
## Gorenjska banka 1.44 -0.48 -1.16
## Sparkasse -0.63 -0.53 1.21
## Delezna banka -0.47 0.82 -0.24
## Intesa Saopolo 0.39 0.23 -0.65
## Other 0.29 -0.63 0.27
# Observed cell counts behind the test (bank x cluster).
round(x = chi_square$observed, digits = 2)
## as.factor(mydata$ClusterWard)
## mydata$BankF 1 2 3
## NLB 21 9 14
## NKBM 13 12 16
## SKB 3 5 3
## Revolut 0 2 0
## N26 1 1 1
## Unicredit 2 3 2
## Addiko 0 0 1
## Gorenjska banka 9 3 2
## Sparkasse 0 0 1
## Delezna banka 1 2 1
## Intesa Saopolo 3 2 1
## Other 5 2 4
# Cell shares of the whole sample with row and column totals. Margins are
# added before rounding so the totals are not distorted by summing
# already-rounded cells.
round(addmargins(prop.table(chi_square$observed)), 3)
## as.factor(mydata$ClusterWard)
## mydata$BankF 1 2 3 Sum
## NLB 0.145 0.062 0.097 0.303
## NKBM 0.090 0.083 0.110 0.283
## SKB 0.021 0.034 0.021 0.076
## Revolut 0.000 0.014 0.000 0.014
## N26 0.007 0.007 0.007 0.021
## Unicredit 0.014 0.021 0.014 0.048
## Addiko 0.000 0.000 0.007 0.007
## Gorenjska banka 0.062 0.021 0.014 0.097
## Sparkasse 0.000 0.000 0.007 0.007
## Delezna banka 0.007 0.014 0.007 0.028
## Intesa Saopolo 0.021 0.014 0.007 0.041
## Other 0.034 0.014 0.028 0.076
## Sum 0.400 0.283 0.317 1.000
# Row percentages (cluster split within each bank) with a row total.
# The total is added before rounding so it is not built from
# already-rounded cells.
round(addmargins(prop.table(chi_square$observed, 1), 2), 3)
## as.factor(mydata$ClusterWard)
## mydata$BankF 1 2 3 Sum
## NLB 0.477 0.205 0.318 1.000
## NKBM 0.317 0.293 0.390 1.000
## SKB 0.273 0.455 0.273 1.000
## Revolut 0.000 1.000 0.000 1.000
## N26 0.333 0.333 0.333 1.000
## Unicredit 0.286 0.429 0.286 1.000
## Addiko 0.000 0.000 1.000 1.000
## Gorenjska banka 0.643 0.214 0.143 1.000
## Sparkasse 0.000 0.000 1.000 1.000
## Delezna banka 0.250 0.500 0.250 1.000
## Intesa Saopolo 0.500 0.333 0.167 1.000
## Other 0.455 0.182 0.364 1.000
# Column percentages (bank split within each cluster) with a column total.
# The total is added before rounding so it is not built from
# already-rounded cells.
round(addmargins(prop.table(chi_square$observed, 2), 1), 3)
## as.factor(mydata$ClusterWard)
## mydata$BankF 1 2 3
## NLB 0.362 0.220 0.304
## NKBM 0.224 0.293 0.348
## SKB 0.052 0.122 0.065
## Revolut 0.000 0.049 0.000
## N26 0.017 0.024 0.022
## Unicredit 0.034 0.073 0.043
## Addiko 0.000 0.000 0.022
## Gorenjska banka 0.155 0.073 0.043
## Sparkasse 0.000 0.000 0.022
## Delezna banka 0.017 0.049 0.022
## Intesa Saopolo 0.052 0.049 0.022
## Other 0.086 0.049 0.087
## Sum 1.000 1.000 1.000
# Load the perception-map survey export: semicolon-separated, "." as the
# decimal mark, first line holds the column names.
mydata2 <- read.table(
  file = "~/IMB/Mutivariat analysis/Perception map data.csv",
  sep = ";",
  dec = ".",
  header = TRUE
)
# Drop the first data row, which holds the question wording, not answers.
mydata2 <- mydata2[-1, ]
# Coerce the 18 rating items (Q17a-f, Q18a-f, Q19a-f) to numeric in one pass.
rating_cols <- paste0("Q", rep(17:19, each = 6), letters[1:6])
mydata2[rating_cols] <- lapply(mydata2[rating_cols], as.numeric)
# Rename the rating items to "<bank>_<dimension>" labels via a lookup table
# (old item code -> new column name).
rating_labels <- c(
  Q17a = "NLB_Mobile App",
  Q17b = "NKBM_Mobile App",
  Q17c = "SKB_Mobile App",
  Q17d = "Unicredit_Mobile App",
  Q17e = "Revolut_Mobile App",
  Q17f = "N26_Mobile App",
  Q18a = "NLB_Customer experience",
  Q18b = "NKBM_Customer experience",
  Q18c = "SKB_Customer experience",
  Q18d = "Unicredit_Customer experience",
  Q18e = "Revolut_Customer experience",
  Q18f = "N26_Customer experience",
  Q19a = "NLB_Reputation",
  Q19b = "NKBM_Reputation",
  Q19c = "SKB_Reputation",
  Q19d = "Unicredit_Reputation",
  Q19e = "Revolut_Reputation",
  Q19f = "N26_Reputation"
)
# Only touch columns whose current name is one of the item codes.
to_rename <- colnames(mydata2) %in% names(rating_labels)
colnames(mydata2)[to_rename] <-
  unname(rating_labels[colnames(mydata2)[to_rename]])
# Preview the first six columns (the mobile-app ratings) before recoding.
head(mydata2[, 1:6])
## NLB_Mobile App NKBM_Mobile App SKB_Mobile App Unicredit_Mobile App
## 2 8 8 8 8
## 3 6 -1 -1 -1
## 4 5 8 8 8
## 5 6 8 8 8
## 6 2 8 6 8
## 7 8 8 6 8
## Revolut_Mobile App N26_Mobile App
## 2 8 8
## 3 -1 -1
## 4 6 8
## 5 6 8
## 6 7 6
## 7 8 8
library(dplyr)
library(naniar)
# Recode every whole-number value from -99 to -1 as NA, in all columns at once.
mydata2 <- replace_with_na_all(
  data = mydata2,
  condition = ~ .x %in% seq(-99, -1)
)
# Replace each remaining NA with the mean of its own column (mean computed
# with NAs ignored). across() is used in place of the superseded
# mutate_all() scoped verb; the result is the same.
mydata2 <- mydata2 %>%
  mutate(
    across(
      everything(),
      ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)
    )
  )
# Preview the same six columns after recoding and mean imputation.
head(mydata2[, seq_len(6)])
## # A tibble: 6 × 6
## `NLB_Mobile App` `NKBM_Mobile App` `SKB_Mobile App` `Unicredit_Mobile App`
## <dbl> <dbl> <dbl> <dbl>
## 1 8 8 8 8
## 2 6 6.86 6.96 7.24
## 3 5 8 8 8
## 4 6 8 8 8
## 5 2 8 6 8
## 6 8 8 6 8
## # ℹ 2 more variables: `Revolut_Mobile App` <dbl>, `N26_Mobile App` <dbl>
library(tibble)
# Build the bank x dimension matrix for the PCA: split each
# "<bank>_<dimension>" column name at the underscore while reshaping to long
# form, average the scores for every bank/dimension pair, and move the bank
# label into the row names.
mydata_PCA <- mydata2 %>%
  pivot_longer(
    cols = everything(),
    names_to = c("retailer", "dimension"),
    names_sep = "_",
    values_to = "score"
  ) %>%
  pivot_wider(
    names_from = dimension,
    values_from = score,
    values_fn = mean
  ) %>%
  column_to_rownames(var = "retailer")
print(mydata_PCA)
## Mobile App Customer experience Reputation
## NLB 6.540541 6.718121 6.067114
## NKBM 6.863946 6.926174 6.187919
## SKB 6.958621 7.034014 6.040816
## Unicredit 7.243056 7.340136 6.287671
## Revolut 7.000000 7.020270 6.391892
## N26 7.137931 7.289655 6.734694
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.3.2
# PCA on the bank-level mean scores: variables scaled to unit variance,
# two components kept, FactoMineR's automatic plots switched off.
pca <- PCA(X = mydata_PCA, scale.unit = TRUE, ncp = 2, graph = FALSE)
# Correlations between each rating dimension and the two components.
print(pca[["var"]][["cor"]])
## Dim.1 Dim.2
## Mobile App 0.9514762 -0.2875578
## Customer experience 0.9703558 -0.2115676
## Reputation 0.8021391 0.5970290
library(factoextra)
# Biplot of banks and rating dimensions from the PCA; repel = TRUE keeps
# the text labels from overlapping.
fviz_pca_biplot(X = pca, repel = TRUE)