NLB project analysis Group1

mydata <-read.table("~/IMB/Mutivariat analysis/analiza-anketa.csv", header = TRUE, sep = ";", dec= ",")

mydata <- mydata[-1, ] #delete first row in which the questions are written

mydata$ID <- seq(1,nrow(mydata))

head(mydata)

##   Q1a Q1b Q1c Q1d Q1e Q1f Q1g Q1h Q1i Q1j Q1k Q1l Q1l_text Q2 Q2_12_text Q3 Q4
## 2   0   0   0   0   0   0   0   1   0   0   0   0       -2  8         -2  1  5
## 3   1   0   0   0   0   0   0   0   0   0   0   0       -2  1         -2  3  2
## 4   1   0   0   1   0   0   0   0   0   0   0   0       -2  1         -2  1  2
## 5   1   0   0   0   0   0   0   0   0   0   0   0       -2  1         -2  2  4
## 6   0   0   0   1   0   0   0   0   0   0   1   0       -2 11         -2  2  2
## 7   0   0   1   0   0   0   0   0   0   0   0   0       -2  3         -2  1  2
##   Q5 Q6a Q6b Q6c Q6d Q7a Q7b Q7c Q7d Q7e Q7f Q7g Q7h Q7i Q7j Q7k Q7l Q8a Q8b
## 2  3   4   5   6   4   5   5   6   5   6   5   4   5   4   5   6   4   1   0
## 3  4   5   6   3   6   6   7   5   5   7   6   3   2   5   6   3   5   0   1
## 4  3   3   5   4   5   5   6   6   4   5   5   5   5   4   5   5   6   1   1
## 5  5   6   4   5   4   4   2   6   6   7   6   4   3   5   6   4   6   1   0
## 6  3   4   6   6   4   5   5   6   6   7   5   6   6   6   7   6   6   0   0
## 7  5   2   5   5   3   6   6   6   6   7   5   6   5   5   6   5   5   0   0
##   Q8c Q8d Q8e Q8f Q8g Q8h Q8i Q8j Q8k Q8k_text Q9a Q9b Q9c Q9d Q9e Q9f Q9g Q9h
## 2   1   0   0   0   0   0   0   1   0       -2   0   1   0   0   0   1   0   0
## 3   0   0   0   1   1   0   0   0   0       -2   0   0   1   0   1   0   0   0
## 4   0   0   0   1   0   0   0   0   0       -2   0   0   1   1   0   0   0   0
## 5   1   0   0   1   0   0   0   0   0       -2   0   0   0   0   1   0   1   0
## 6   0   0   0   1   0   0   1   1   0       -2   0   1   0   1   0   0   1   0
## 7   1   0   0   1   0   1   0   0   0       -2   0   0   0   0   1   0   1   0
##   Q9i Q9j Q9k Q9k_text Q10a Q10b Q10c Q10d Q10e Q10f Q11a Q11b Q11c Q11d Q11e
## 2   0   1   0       -2    5    5    5    4    5    4    1    1    1    0    0
## 3   0   1   0       -2    2    3    1    6    5    4    0    1    1    0    1
## 4   0   1   0       -2    2    2    2    2    3    2    0    1    1    1    0
## 5   0   1   0       -2    6    7    2    2    6    2    1    1    0    1    0
## 6   0   0   0       -2    2    2    5    2    3    2    0    1    1    0    0
## 7   0   1   0       -2    5    5    5    5    5    3    0    0    1    1    1
##   Q11f      Q11f_text Q13a Q13b Q13c Q13d Q13e Q13f Q13g Q13h Q13i Q14a Q14b
## 2    0             -2    5    6    4    6    6    6    5    5    6    0    0
## 3    0             -2    6    2    3    2    7    7    3    7    6    0    1
## 4    0             -2    5    5    4    4    5    6    5    6    6    0    1
## 5    0             -2    6    2    7    6    7    7    2    7    7    0    1
## 6    1 Letalske karte    5    6    2    4    6    6    5    6    6    0    0
## 7    0             -2    5    3    5    6    7    6    4    6    6    0    0
##   Q14c Q14d Q14e Q14f Q14g Q14h Q14i Q14j Q14j_text Q15a Q15b Q15c Q15d Q15e
## 2    0    1    0    1    0    1    0    0        -2    0    0    1    0    1
## 3    0    1    1    0    0    0    0    0        -2    1    1    0    0    1
## 4    1    0    1    0    0    0    0    0        -2    1    1    0    0    1
## 5    1    1    0    0    0    0    0    0        -2    0    0    0    1    1
## 6    1    0    1    0    1    0    0    0        -2    1    1    0    0    1
## 7    0    1    1    0    0    1    0    0        -2    1    0    0    0    0
##   Q15f Q15g Q15g_text Q16a Q16b Q16c Q16d Q16e Q16f Q16g Q16h Q16i Q17a Q17b
## 2    1    0        -2    5    5    5    5    5    5    5    5    6    8    8
## 3    0    0        -2    3    2    6    5    6    4    1    2    6    6   -1
## 4    0    0        -2    5    5    6    5    5    5    4    5    5    5    8
## 5    0    1 Apple Pay    4    4    6    6    6    6    4    4    6    6    8
## 6    0    0        -2    4    4    5    6    6    4    4    5    3    2    8
## 7    0    0        -2    4    4    6    6    6    5    4    4    6    8    8
##   Q17c Q17d Q17e Q17f Q18a Q18b Q18c Q18d Q18e Q18f Q19a Q19b Q19c Q19d Q19e
## 2    8    8    8    8    8    8    8    8    8    8    8    8    8    8    8
## 3   -1   -1   -1   -1    5   -1   -1   -1   -1   -1    5   -1   -1   -1   -1
## 4    8    8    6    8    5    8    8    8    6    8    3    3    4    4    3
## 5    8    8    6    8    5    8    8    8    6    8    6    8    8    6    6
## 6    6    8    7    6    3    8    5    8    6    6    4    5    6    5    6
## 7    6    8    8    8    8    8    6    8    8    8    8    8    5    8    8
##   Q19f Q20 Q21 Q22 Q22_6_text Q23 Q24 Q24_6_text Q25 Q26 Q26_4_text Q27 Q28 ID
## 2    8  23   2   1         -2   4   1         -2   1   1         -2   3   3  1
## 3   -1  20   1   5         -2   3   5         -2   1   1         -2   2   1  2
## 4    3  26   1   2         -2   5   2         -2   5   2         -2   7   3  3
## 5    8  25   2   2         -2   5   1         -2   3   2         -2   1   1  4
## 6    6  24   1   5         -2   5   1         -2   3   2         -2   4   1  5
## 7    8  25   2   4         -2   5   2         -2   4   2         -2   1   2  6

Description

Clustering

#13e- preprosta in intuitivna aplikacija
#13f - Varnost
#7j - Preglednost informacij o pogojih in stroških
#7l - Ugled banke
#10a- Starši (pomembna vloga) pri odprtju novega bančnega računa
#7a - pogoji za dolgoročno posojilo

mydata <- mydata[mydata$Q13e > 0, ]
mydata <- mydata[mydata$Q13f > 0, ]
mydata <- mydata[mydata$Q7j > 0, ]
mydata <- mydata[mydata$Q7l > 0, ]
mydata <- mydata[mydata$Q10a > 0, ]


mydata$Q13e <- as.numeric(mydata$Q13e)
mydata$Q13f <- as.numeric(mydata$Q13f)
mydata$Q7j <- as.numeric(mydata$Q7j)
mydata$Q7l <- as.numeric(mydata$Q7l)
mydata$Q10a <- as.numeric(mydata$Q10a)

summary(mydata[c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")]) #Describing clustering variables

##       Q13e            Q13f           Q7j             Q7l             Q10a      
##  Min.   :1.000   Min.   :3.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:6.000   1st Qu.:6.00   1st Qu.:5.000   1st Qu.:5.000   1st Qu.:4.000  
##  Median :7.000   Median :7.00   Median :6.000   Median :6.000   Median :5.000  
##  Mean   :6.387   Mean   :6.38   Mean   :5.813   Mean   :5.487   Mean   :4.853  
##  3rd Qu.:7.000   3rd Qu.:7.00   3rd Qu.:7.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.00   Max.   :7.000   Max.   :7.000   Max.   :7.000

#Saving standardized cluster variables into new data frame
mydata_clu_std <- as.data.frame(scale(mydata[c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")]))

mydata$Dissimilarity <- sqrt(mydata_clu_std$Q13e^2 + mydata_clu_std$Q13f^2 + mydata_clu_std$Q7j^2 + 
                             mydata_clu_std$Q7l^2 + mydata_clu_std$Q10a^2) #Finding outliers

head(mydata[order(-mydata$Dissimilarity), c("ID", "Dissimilarity")]) #Finding units with highest value of dissimilarity

##      ID Dissimilarity
## 38   37      7.599202
## 97   96      5.062270
## 124 123      4.978508
## 102 101      4.441051
## 34   33      4.399388
## 35   34      4.381617

print(mydata[37, ]) #Showing customer ID37

##    Q1a Q1b Q1c Q1d Q1e Q1f Q1g Q1h Q1i Q1j Q1k Q1l Q1l_text Q2 Q2_12_text Q3 Q4
## 38   1   0   0   1   0   0   0   0   0   0   0   0       -2  1         -2  2  1
##    Q5 Q6a Q6b Q6c Q6d Q7a Q7b Q7c Q7d Q7e Q7f Q7g Q7h Q7i Q7j Q7k Q7l Q8a Q8b
## 38  5   6   6   1   5   7   7   1   2   1   1   1   1   1   1   1   1   1   1
##    Q8c Q8d Q8e Q8f Q8g Q8h Q8i Q8j Q8k Q8k_text Q9a Q9b Q9c Q9d Q9e Q9f Q9g Q9h
## 38   0   0   1   0   0   0   0   0   0       -2   0   0   1   0   0   0   1   0
##    Q9i Q9j Q9k Q9k_text Q10a Q10b Q10c Q10d Q10e Q10f Q11a Q11b Q11c Q11d Q11e
## 38   0   1   0       -2    7    5    3    1    1    1    1    1    0    1    0
##    Q11f Q11f_text Q13a Q13b Q13c Q13d Q13e Q13f Q13g Q13h Q13i Q14a Q14b Q14c
## 38    0        -2    6    1    7    1    1    7    5    6    7    0    1    0
##    Q14d Q14e Q14f Q14g Q14h Q14i Q14j Q14j_text Q15a Q15b Q15c Q15d Q15e Q15f
## 38    1    1    0    0    0    0    0        -2    1    1    0    0    1    0
##    Q15g Q15g_text Q16a Q16b Q16c Q16d Q16e Q16f Q16g Q16h Q16i Q17a Q17b Q17c
## 38    0        -2    4    4    7    7    5    4    2    4    7    7    8    5
##    Q17d Q17e Q17f Q18a Q18b Q18c Q18d Q18e Q18f Q19a Q19b Q19c Q19d Q19e Q19f
## 38    8    7    8    5    8    8    8    7    8    7    8    6    8    5    8
##    Q20 Q21 Q22 Q22_6_text Q23 Q24 Q24_6_text Q25 Q26 Q26_4_text Q27 Q28 ID
## 38  -1   2   1         -2   4   5         -2   1   2         -2   1   1 37
##    Dissimilarity
## 38      7.599202

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

mydata <- mydata %>%
  filter(!ID %in% c(37)) #Removing ID37 from original data frame

mydata_clu_std <- as.data.frame(scale(mydata[c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a")]))

#install.packages("factoextra")
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.3.2

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

#Finding Eudlidean distances, based on 6 Cluster variables, then saving them into object Distances

Distances <- get_dist(mydata_clu_std, 
                      method = "euclidian")

Distances2 <- Distances^2

fviz_dist(Distances2, #Showing matrix of distances
          gradient = list(low = "darkred",
                          mid = "grey95",
                          high = "white"))

library(factoextra) 
get_clust_tendency(mydata_clu_std, #Hopkins statistics
                   n = nrow(mydata_clu_std) - 1,
                   graph = FALSE)

## $hopkins_stat
## [1] 0.7159374
## 
## $plot
## NULL

library(dplyr)
WARD <- mydata_clu_std %>% #Selecting variables
  get_dist(method = "euclidean") %>%  #Selecting distance
  hclust(method = "ward.D2") #Selecting algorithm         

WARD

## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 149

library(factoextra)
fviz_dend(WARD) #Dendrogram

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

mydata$ClusterWard <- cutree(WARD, 
                             k = 4) #Number of groups

head(mydata[c("ID", "ClusterWard")])

##   ID ClusterWard
## 1  1           1
## 2  2           2
## 3  3           2
## 4  4           3
## 5  5           2
## 6  6           3

#Showing the positions of initial leaders, used as starting point for k-means clustering

Leaders_initial <- aggregate(mydata_clu_std, 
                             by = list(mydata$ClusterWard), 
                             FUN = mean)

Leaders_initial

##   Group.1       Q13e       Q13f        Q7j        Q7l       Q10a
## 1       1 -1.1417980 -1.1771576 -0.3658404 -0.3762346  0.3916587
## 2       2  0.1765311  0.2048837  0.2418637  0.3145441 -1.7760976
## 3       3  0.1948608  0.1688476  0.1119981  0.4194214  0.4331117
## 4       4  0.5190073  0.6411094 -0.2764593 -1.6484563 -0.1160904

library(factoextra) 

kmeans_clu <- hkmeans(mydata_clu_std, #Data
                      k = 4, #Number of groups
                      hc.metric = "euclidean", #Distance for hierar. clus.
                      hc.method = "ward.D2") #Algorithm for hierar. clus.

kmeans_clu

## Hierarchical K-means clustering with 4 clusters of sizes 32, 23, 69, 25
## 
## Cluster means:
##          Q13e       Q13f        Q7j        Q7l       Q10a
## 1 -0.77432259 -1.3523878 -0.5968513 -0.2835846  0.4401102
## 2  0.06097423  0.2876994  0.3218533  0.3458933 -1.8572402
## 3  0.15660750  0.3414177  0.3469481  0.5396884  0.4715530
## 4  0.50279993  0.5240600 -0.4897122 -1.4447734 -0.1561663
## 
## Clustering vector:
##   [1] 1 2 2 3 2 3 3 3 2 4 3 3 3 2 1 2 3 1 3 2 3 1 1 3 4 3 2 1 4 1 3 3 1 1 1 2 1
##  [38] 2 3 2 3 3 4 3 2 4 3 4 3 3 3 3 3 3 1 3 4 3 2 3 3 3 3 3 2 1 3 3 4 3 3 2 3 3
##  [75] 2 1 3 3 1 1 1 4 4 3 1 1 3 3 3 2 4 4 3 1 1 4 2 2 1 3 1 3 1 2 4 3 4 3 3 3 1
## [112] 1 3 4 3 3 3 2 3 1 4 3 3 1 3 1 2 1 4 3 4 3 1 3 4 3 4 3 3 3 4 4 3 1 3 4 2 4
## [149] 3
## 
## Within cluster sum of squares by cluster:
## [1] 137.79060  63.18242 136.08827  93.13940
##  (between_SS / total_SS =  41.9 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
##  [6] "betweenss"    "size"         "iter"         "ifault"       "data"        
## [11] "hclust"

fviz_cluster(kmeans_clu,
             palette = "jama",
             repel = FALSE,
             ggtheme =theme_classic())

mydata$ClusterK_MEANS <- kmeans_clu$cluster
head(mydata[, c("ID", "ClusterWard", "ClusterK_MEANS")])

##   ID ClusterWard ClusterK_MEANS
## 1  1           1              1
## 2  2           2              2
## 3  3           2              2
## 4  4           3              3
## 5  5           2              2
## 6  6           3              3

table(mydata$ClusterWard)

## 
##  1  2  3  4 
## 26 24 80 19

table(mydata$ClusterK_MEANS)

## 
##  1  2  3  4 
## 32 23 69 25

#Checking for reclassifications
table(mydata$ClusterWard, mydata$ClusterK_MEANS)

##    
##      1  2  3  4
##   1 21  1  2  2
##   2  1 21  1  1
##   3 10  1 65  4
##   4  0  0  1 18

Centroids <- kmeans_clu$centers
round(Centroids, 3)

##     Q13e   Q13f    Q7j    Q7l   Q10a
## 1 -0.774 -1.352 -0.597 -0.284  0.440
## 2  0.061  0.288  0.322  0.346 -1.857
## 3  0.157  0.341  0.347  0.540  0.472
## 4  0.503  0.524 -0.490 -1.445 -0.156

library(ggplot2)
library(tidyr)

Picture <- as.data.frame(Centroids)
Picture$id <- 1:nrow(Picture)
Picture <- pivot_longer(Picture, cols = c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a"))

Picture$Group <- factor(Picture$id, 
                        levels = c(1, 2, 3, 4), 
                        labels = c("1", "2", "3", "4"))

Picture$nameFactor <- factor(Picture$name, 
                             levels = c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a"), 
                             labels = c("Q13e", "Q13f", "Q7j", "Q7l", "Q10a"))

#Showing the lines
ggplot(Picture, aes(x = nameFactor, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Group, col = Group), size = 3) +
  geom_line(aes(group = id, linetype = Group, col = Group), linewidth = 1) +
  ylab("Centroid") +
  xlab("Cluster variables")

#Checking if cluster variables are okay (if they differentiate)
fit <- aov(cbind(Q13e, Q13f, Q7j, Q7l, Q10a) ~ as.factor(ClusterK_MEANS), 
           data = mydata)

summary(fit)

##  Response Q13e :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_MEANS)   3 22.558  7.5193  10.924 1.637e-06 ***
## Residuals                 145 99.804  0.6883                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q13f :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_MEANS)   3 49.354 16.4513  50.115 < 2.2e-16 ***
## Residuals                 145 47.599  0.3283                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q7j :
##                            Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_MEANS)   3  37.466 12.4888  11.319 1.025e-06 ***
## Residuals                 145 159.983  1.1033                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q7l :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_MEANS)   3 125.43  41.811  53.286 < 2.2e-16 ***
## Residuals                 145 113.78   0.785                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Q10a :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_MEANS)   3 323.75 107.917  105.45 < 2.2e-16 ***
## Residuals                 145 148.38   1.023                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

mydata <- mydata[mydata$Q20 > 0, ]
mydata$Q20 <- as.numeric(mydata$Q20)

aggregate(mydata$Q20, 
          by = list(mydata$ClusterK_MEANS), 
          FUN = mean)

##   Group.1        x
## 1       1 21.80645
## 2       2 23.13043
## 3       3 22.25373
## 4       4 22.56000

fit <- aov(Q20 ~ as.factor(ClusterK_MEANS), 
           data = mydata)

summary.aov(fit)

##                            Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_MEANS)   3   24.9   8.297   1.627  0.186
## Residuals                 142  724.3   5.101

NLB project analysis Group1

Anej Levpuscek

2024-01-24