SoA entrance test

library(readxl)
library(ggplot2)
library(circlize)

## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================

library(corrplot)

## corrplot 0.92 loaded

library(dplyr)

## 
## Присоединяю пакет: 'dplyr'

## Следующие объекты скрыты от 'package:stats':
## 
##     filter, lag

## Следующие объекты скрыты от 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(kohonen)
library(viridis)

## Загрузка требуемого пакета: viridisLite

library(qgraph)
library(gt)
library(gtsummary)

# Import the survey workbook and print the structure of the resulting tibble
data <- read_excel(path = "SoA - Admission case - Data - vF - 14.02.2023.xlsx")
str(object = data)

## tibble [224 × 10] (S3: tbl_df/tbl/data.frame)
##  $ Main bank                                    : chr [1:224] "Bank Aster" "Bank Aster" "Bank Rose" "Bank Snowdrop" ...
##  $ Gender                                       : num [1:224] 2 2 1 1 2 2 2 2 2 2 ...
##  $ Age                                          : num [1:224] 38 37 35 47 22 42 26 26 39 31 ...
##  $ A serious and stable company                 : num [1:224] 5 4 5 5 5 5 5 5 4 5 ...
##  $ A progressive and modern company             : num [1:224] 4 4 4 4 5 4 4 4 3 5 ...
##  $ Service quality                              : num [1:224] 5 5 5 4 5 5 4 5 4 5 ...
##  $ Offers all common credit services            : num [1:224] 5 5 5 5 5 5 5 4 3 5 ...
##  $ Ease and speed of processing  credit products: num [1:224] 5 4 4 5 5 5 4 5 4 4 ...
##  $ Company’s lifetime on the market             : num [1:224] 3 4 5 4 4 5 4 5 4 4 ...
##  $ A trustworthy company                        : num [1:224] 5 5 5 5 5 5 5 5 5 5 ...

# Descriptive statistics of every variable, split by the respondent's main bank
tbl_summary(as.data.frame(data), by = `Main bank`)

Characteristic	Bank Aster, N = 60¹	Bank Barberry, N = 44¹	Bank Bramble, N = 23¹	Bank Rose, N = 45¹	Bank Snowdrop, N = 52¹
Gender
1	29 (48%)	15 (34%)	13 (57%)	30 (67%)	28 (54%)
2	31 (52%)	29 (66%)	10 (43%)	15 (33%)	24 (46%)
Age	26 (19, 40)	30 (26, 43)	36 (36, 42)	41 (28, 47)	41 (24, 48)
A serious and stable company
1	0 (0%)	0 (0%)	0 (0%)	1 (2.2%)	2 (3.8%)
2	0 (0%)	0 (0%)	0 (0%)	4 (8.9%)	0 (0%)
3	0 (0%)	4 (9.1%)	0 (0%)	4 (8.9%)	5 (9.6%)
4	17 (28%)	17 (39%)	4 (17%)	15 (33%)	5 (9.6%)
5	43 (72%)	23 (52%)	19 (83%)	21 (47%)	40 (77%)
A progressive and modern company
1	0 (0%)	0 (0%)	0 (0%)	4 (8.9%)	6 (12%)
2	4 (6.7%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)
3	12 (20%)	8 (18%)	2 (8.7%)	7 (16%)	10 (19%)
4	33 (55%)	26 (59%)	14 (61%)	24 (53%)	21 (40%)
5	11 (18%)	10 (23%)	7 (30%)	10 (22%)	13 (25%)
9	0 (0%)	0 (0%)	0 (0%)	0 (0%)	2 (3.8%)
Service quality
1	0 (0%)	0 (0%)	0 (0%)	4 (8.9%)	1 (1.9%)
3	0 (0%)	1 (2.3%)	4 (17%)	0 (0%)	4 (7.7%)
4	20 (33%)	20 (45%)	1 (4.3%)	14 (31%)	12 (23%)
5	40 (67%)	23 (52%)	18 (78%)	27 (60%)	35 (67%)
Offers all common credit services
2	0 (0%)	0 (0%)	0 (0%)	0 (0%)	1 (1.9%)
3	0 (0%)	2 (4.5%)	4 (17%)	11 (24%)	17 (33%)
4	19 (32%)	22 (50%)	0 (0%)	22 (49%)	14 (27%)
5	41 (68%)	20 (45%)	19 (83%)	12 (27%)	20 (38%)
Ease and speed of processing credit products
2	0 (0%)	0 (0%)	0 (0%)	0 (0%)	2 (3.8%)
3	3 (5.0%)	0 (0%)	4 (17%)	4 (8.9%)	8 (15%)
4	31 (52%)	22 (50%)	9 (39%)	25 (56%)	19 (37%)
5	26 (43%)	22 (50%)	10 (43%)	16 (36%)	23 (44%)
Company’s lifetime on the market
1	2 (3.3%)	3 (6.8%)	0 (0%)	4 (8.9%)	3 (5.8%)
2	2 (3.3%)	3 (6.8%)	0 (0%)	4 (8.9%)	6 (12%)
3	8 (13%)	3 (6.8%)	5 (22%)	5 (11%)	3 (5.8%)
4	20 (33%)	22 (50%)	8 (35%)	11 (24%)	24 (46%)
5	28 (47%)	13 (30%)	10 (43%)	21 (47%)	16 (31%)
A trustworthy company
2	1 (1.7%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)
3	0 (0%)	1 (2.3%)	0 (0%)	4 (8.9%)	3 (5.8%)
4	12 (20%)	16 (36%)	4 (17%)	13 (29%)	17 (33%)
5	47 (78%)	27 (61%)	19 (83%)	28 (62%)	30 (58%)
9	0 (0%)	0 (0%)	0 (0%)	0 (0%)	2 (3.8%)
¹ n (%); Median (IQR)

# Work on a plain data.frame copy of the imported tibble
data1 <- data %>% as.data.frame()
str(object = data1)

## 'data.frame':    224 obs. of  10 variables:
##  $ Main bank                                    : chr  "Bank Aster" "Bank Aster" "Bank Rose" "Bank Snowdrop" ...
##  $ Gender                                       : num  2 2 1 1 2 2 2 2 2 2 ...
##  $ Age                                          : num  38 37 35 47 22 42 26 26 39 31 ...
##  $ A serious and stable company                 : num  5 4 5 5 5 5 5 5 4 5 ...
##  $ A progressive and modern company             : num  4 4 4 4 5 4 4 4 3 5 ...
##  $ Service quality                              : num  5 5 5 4 5 5 4 5 4 5 ...
##  $ Offers all common credit services            : num  5 5 5 5 5 5 5 4 3 5 ...
##  $ Ease and speed of processing  credit products: num  5 4 4 5 5 5 4 5 4 4 ...
##  $ Company’s lifetime on the market             : num  3 4 5 4 4 5 4 5 4 4 ...
##  $ A trustworthy company                        : num  5 5 5 5 5 5 5 5 5 5 ...

# Flag respondents aged 18 to 30 (Age strictly above 17 and strictly below 31);
# everybody else goes to the "other respondents" group
data1$Sample <- with(
  data1,
  ifelse(Age > 17 & Age < 31, "Респонденты от 18 до 30", "Другие респонденты")
)

# Descriptive statistics
# NOTE(review): two rating columns report Max. = 9 although the others top out
# at 5 - possibly a "no answer" code; confirm before averaging these columns.
summary(data1)

##   Main bank             Gender           Age       
##  Length:224         Min.   :1.000   Min.   :18.00  
##  Class :character   1st Qu.:1.000   1st Qu.:24.75  
##  Mode  :character   Median :1.000   Median :36.00  
##                     Mean   :1.487   Mean   :35.39  
##                     3rd Qu.:2.000   3rd Qu.:45.00  
##                     Max.   :2.000   Max.   :64.00  
##  A serious and stable company A progressive and modern company Service quality
##  Min.   :1.000                Min.   :1.000                    Min.   :1.000  
##  1st Qu.:4.000                1st Qu.:4.000                    1st Qu.:4.000  
##  Median :5.000                Median :4.000                    Median :5.000  
##  Mean   :4.518                Mean   :3.929                    Mean   :4.531  
##  3rd Qu.:5.000                3rd Qu.:4.000                    3rd Qu.:5.000  
##  Max.   :5.000                Max.   :9.000                    Max.   :5.000  
##  Offers all common credit services
##  Min.   :2.000                    
##  1st Qu.:4.000                    
##  Median :4.500                    
##  Mean   :4.339                    
##  3rd Qu.:5.000                    
##  Max.   :5.000                    
##  Ease and speed of processing  credit products Company’s lifetime on the market
##  Min.   :2.00                                  Min.   :1.000                   
##  1st Qu.:4.00                                  1st Qu.:4.000                   
##  Median :4.00                                  Median :4.000                   
##  Mean   :4.33                                  Mean   :3.991                   
##  3rd Qu.:5.00                                  3rd Qu.:5.000                   
##  Max.   :5.00                                  Max.   :5.000                   
##  A trustworthy company    Sample         
##  Min.   :2.000         Length:224        
##  1st Qu.:4.000         Class :character  
##  Median :5.000         Mode  :character  
##  Mean   :4.674                           
##  3rd Qu.:5.000                           
##  Max.   :9.000

# Spread of the ratings per attribute
data1_3 <- pivot_longer(
  data1,
  cols = "A serious and stable company":"A trustworthy company",
  names_to = "Indicator",
  values_to = "Value"
)
ggplot(data1_3, aes(x = Indicator, y = Value, fill = Indicator)) +
  geom_boxplot() # not an informative chart

# Distribution of respondents across banks
colnames(data1)

##  [1] "Main bank"                                    
##  [2] "Gender"                                       
##  [3] "Age"                                          
##  [4] "A serious and stable company"                 
##  [5] "A progressive and modern company"             
##  [6] "Service quality"                              
##  [7] "Offers all common credit services"            
##  [8] "Ease and speed of processing  credit products"
##  [9] "Company’s lifetime on the market"             
## [10] "A trustworthy company"                        
## [11] "Sample"

# Number of respondents per main bank, with the count printed on each bar.
# after_stat(count) replaces the `..count..` notation deprecated in ggplot2 3.4.0.
ggplot(data = data1, aes(x = `Main bank`)) +
  geom_bar(fill = "#163B88") +
  stat_count(geom = "label", aes(label = after_stat(count))) +
  theme_classic() +
  theme(axis.text.x = element_text(size = 11)) +
  xlab("") +
  ylab("Количество респондентов по банкам")

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.

# Gender split of the whole sample
table(data1[["Gender"]])

## 
##   1   2 
## 115 109

# Gender split within each bank
colnames(data1)

##  [1] "Main bank"                                    
##  [2] "Gender"                                       
##  [3] "Age"                                          
##  [4] "A serious and stable company"                 
##  [5] "A progressive and modern company"             
##  [6] "Service quality"                              
##  [7] "Offers all common credit services"            
##  [8] "Ease and speed of processing  credit products"
##  [9] "Company’s lifetime on the market"             
## [10] "A trustworthy company"                        
## [11] "Sample"

# Respondents per bank, stacked by gender code, with the size of every segment
# printed as a label.
# after_stat(count) replaces the `..count..` notation deprecated in ggplot2 3.4.0.
ggplot(data1, aes(x = `Main bank`, fill = as.character(Gender))) +
  geom_bar() +
  stat_count(geom = "label", aes(label = after_stat(count)), color = "white") +
  scale_fill_manual(values = c("#163B88", "#192D45")) +
  theme_classic() +
  theme(axis.text.x = element_text(size = 11), legend.position = "bottom") +
  xlab("") +
  ylab("")

# Keep only the 18-30 respondents and summarise them by main bank
data4 <- filter(data1, Sample == "Респонденты от 18 до 30")
data4 %>%
  select(-Sample) %>%
  as.data.frame() %>%
  tbl_summary(by = `Main bank`)

Characteristic	Bank Aster, N = 38¹	Bank Barberry, N = 22¹	Bank Bramble, N = 1¹	Bank Rose, N = 12¹	Bank Snowdrop, N = 19¹
Gender
1	24 (63%)	10 (45%)	1 (100%)	11 (92%)	6 (32%)
2	14 (37%)	12 (55%)	0 (0%)	1 (8.3%)	13 (68%)
Age	20.0 (19.0, 25.8)	26.0 (18.2, 27.0)	19.0 (19.0, 19.0)	24.0 (18.0, 26.5)	22.0 (20.0, 25.0)
A serious and stable company
1	0 (0%)	0 (0%)	0 (0%)	1 (8.3%)	0 (0%)
2	0 (0%)	0 (0%)	0 (0%)	4 (33%)	0 (0%)
4	12 (32%)	5 (23%)	0 (0%)	0 (0%)	3 (16%)
5	26 (68%)	17 (77%)	1 (100%)	7 (58%)	16 (84%)
A progressive and modern company
1	0 (0%)	0 (0%)	0 (0%)	0 (0%)	4 (21%)
2	4 (11%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)
3	8 (21%)	3 (14%)	0 (0%)	4 (33%)	6 (32%)
4	19 (50%)	13 (59%)	1 (100%)	7 (58%)	5 (26%)
5	7 (18%)	6 (27%)	0 (0%)	1 (8.3%)	2 (11%)
9	0 (0%)	0 (0%)	0 (0%)	0 (0%)	2 (11%)
Service quality
3	0 (0%)	0 (0%)	0 (0%)	0 (0%)	2 (11%)
4	13 (34%)	11 (50%)	1 (100%)	4 (33%)	0 (0%)
5	25 (66%)	11 (50%)	0 (0%)	8 (67%)	17 (89%)
Offers all common credit services
3	0 (0%)	1 (4.5%)	0 (0%)	0 (0%)	3 (16%)
4	16 (42%)	12 (55%)	0 (0%)	7 (58%)	1 (5.3%)
5	22 (58%)	9 (41%)	1 (100%)	5 (42%)	15 (79%)
Ease and speed of processing credit products
2	0 (0%)	0 (0%)	0 (0%)	0 (0%)	2 (11%)
3	0 (0%)	0 (0%)	0 (0%)	0 (0%)	2 (11%)
4	20 (53%)	11 (50%)	0 (0%)	7 (58%)	9 (47%)
5	18 (47%)	11 (50%)	1 (100%)	5 (42%)	6 (32%)
Company’s lifetime on the market
1	2 (5.3%)	3 (14%)	0 (0%)	4 (33%)	0 (0%)
2	2 (5.3%)	0 (0%)	0 (0%)	4 (33%)	0 (0%)
3	7 (18%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)
4	16 (42%)	17 (77%)	1 (100%)	4 (33%)	6 (32%)
5	11 (29%)	2 (9.1%)	0 (0%)	0 (0%)	13 (68%)
A trustworthy company
2	1 (2.6%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)
3	0 (0%)	0 (0%)	0 (0%)	0 (0%)	3 (16%)
4	9 (24%)	4 (18%)	0 (0%)	5 (42%)	4 (21%)
5	28 (74%)	18 (82%)	1 (100%)	7 (58%)	12 (63%)
¹ n (%); Median (IQR)

# 18-30 respondents per bank, stacked by gender code, with the size of every
# segment printed as a label.
# after_stat(count) replaces the `..count..` notation deprecated in ggplot2 3.4.0.
ggplot(data4, aes(x = `Main bank`, fill = as.character(Gender))) +
  geom_bar() +
  stat_count(geom = "label", aes(label = after_stat(count)), color = "white") +
  scale_fill_manual(values = c("#163B88", "#192D45")) +
  theme_classic() +
  theme(axis.text.x = element_text(size = 11), legend.position = "bottom") +
  xlab("") +
  ylab("")

# Age distribution of respondents
# whole sample
ggplot(data1, aes(x = Age)) +
  geom_density(fill = "#748CDB") +
  labs(x = "Возраст", y = "Плотность распределения") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11)
  )

# Age distribution by bank

ggplot(data1, aes(x = Age, fill = `Main bank`)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("red", "#C1DFF9", "#748CDB", "#163B88", "#192D45")) +
  labs(x = "Возраст", y = "Плотность распределения") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11),
    legend.position = "bottom"
  )

# Respondents per bank split by age group: absolute counts with a label on every
# segment. after_stat(count) replaces the `..count..` notation deprecated in
# ggplot2 3.4.0.
ggplot(data = data1, aes(x = `Main bank`, fill = Sample)) +
  geom_bar() +
  stat_count(geom = "label", aes(label = after_stat(count))) +
  scale_fill_manual(values = c("#C1DFF9", "#748CDB")) +
  theme_classic() +
  theme(axis.text.x = element_text(size = 11), legend.position = "bottom") +
  xlab("") +
  ylab("Количество респондентов по банкам")

# Same split shown as shares within each bank. position = "fill" rescales every
# bar to a total of 1, so the y axis is labelled as a share instead of a count.
ggplot(data = data1, aes(x = `Main bank`, fill = Sample)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#C1DFF9", "#748CDB")) +
  theme_classic() +
  theme(axis.text.x = element_text(size = 11), legend.position = "bottom") +
  xlab("") +
  ylab("Доля респондентов по банкам")

# Bank x age-group counts in long form (columns Var1, Var2, Freq)
data2 <- as.data.frame(table(data1$`Main bank`, data1$Sample))

# Copy with the age-group labels switched to English, drawn as a chord diagram
data2_test <- data2 %>%
  mutate(
    Var2 = ifelse(Var2 == "Другие респонденты", "Other respondents", "Respondents 18-30")
  )
chordDiagram(data2_test)

# Correlation between the variables (first column, the bank name, is dropped)
data3 <- as.data.frame(data)
data3[, -1] %>%
  as.matrix() %>%
  cor() %>%
  corrplot(method = "number") # correlations are small

# Correlation when the respondents are restricted to the target group
table(data4[["Gender"]])

## 
##  1  2 
## 52 40

# Share of gender code 1 among the 18-30 respondents, computed from data4
# instead of re-typing the counts printed by table(data4$Gender)
mean(data4$Gender == 1)

## [1] 0.5652174

# NOTE(review): 31 and 29 equal the Bank Aster gender counts in the full-sample
# summary table (code 2: 31, code 1: 29), so this is presumably the share of
# gender code 2 among Bank Aster respondents - confirm the intended comparison.
31/(31+29)

## [1] 0.5166667

# Same correlation matrix for the 18-30 group (columns 1 and 11 are dropped)
data4[, -c(1, 11)] %>%
  as.matrix() %>%
  cor() %>%
  corrplot(method = "number") # correlations are small

# Ratings by bank and respondent group
ggplot(data1, aes(x = `Main bank`, y = `A trustworthy company`, fill = Sample)) +
  geom_boxplot() # not informative

# Mean rating of every attribute per bank and age group.
# across() replaces seven hand-written mean() calls over the same column range;
# .groups = "drop" returns an ungrouped tibble, so no grouping leaks into later
# steps and the "grouped output" message is not emitted.
data5 <- data1 %>%
  group_by(`Main bank`, Sample) %>%
  summarise(
    across(`A serious and stable company`:`A trustworthy company`, mean),
    .groups = "drop"
  )

## `summarise()` has grouped output by 'Main bank'. You can override using the
## `.groups` argument.

# Mean "A trustworthy company" rating per bank, bars side by side for the two age groups
ggplot(data5, aes(x = `Main bank`, y = `A trustworthy company`, fill = Sample)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#192D45", "#7990A3")) +
  theme_classic() +
  theme(legend.position = "bottom")

# Mean rating of every attribute per age group, reshaped to long form
# (one row per age group and indicator). The "Ease and speed ..." column is
# renamed to drop the double space it has in the source data.
data6 <- data1 %>%
  group_by(Sample) %>%
  summarise(across(`A serious and stable company`:`A trustworthy company`, mean)) %>%
  rename(
    `Ease and speed of processing credit products` = `Ease and speed of processing  credit products`
  ) %>%
  pivot_longer(
    cols = "A serious and stable company":"A trustworthy company",
    names_to = "Indicator",
    values_to = "Value"
  )

x <- NULL

# Replace every whitespace character in `x` with a line break so that long
# labels wrap one word per line. Extra arguments in `...` are accepted and ignored.
addline_format <- function(x, ...) {
  wrapped <- gsub(pattern = '\\s', replacement = '\n', x = x)
  wrapped
}
# Mean ratings per indicator and age group; axis labels are wrapped one word per line
ggplot(data6, aes(x = Indicator, y = Value, fill = Sample)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#192D45", "#7990A3")) +
  scale_x_discrete(
    breaks = unique(data6$Indicator),
    labels = addline_format(unique(data6$Indicator))
  ) +
  theme_classic() +
  theme(legend.position = "bottom") +
  xlab("")

unique(data6$Indicator)

## [1] "A serious and stable company"                
## [2] "A progressive and modern company"            
## [3] "Service quality"                             
## [4] "Offers all common credit services"           
## [5] "Ease and speed of processing credit products"
## [6] "Company’s lifetime on the market"            
## [7] "A trustworthy company"

# Mean rating per indicator and age group, with the rounded value printed on
# every bar.
# - Axis labels are keyed by indicator name, so each label is matched to its own
#   break instead of relying on the alphabetical order of the axis.
# - Bars and value labels use the same dodge width (0.9) so every label sits
#   over its own bar; a label width of 1 shifted the labels off-centre.
ggplot(data = data6, aes(x = Indicator, y = Value, fill = Sample)) +
  geom_col(position = position_dodge(width = 0.9)) +
  geom_label(
    aes(label = round(Value, 2)),
    position = position_dodge(width = 0.9),
    color = "black"
  ) +
  scale_fill_manual(values = c("#C1DFF9", "#748CDB")) +
  scale_x_discrete(labels = c(
    "A progressive and modern company" = "A progressive \nand modern company",
    "A serious and stable company" = "A serious \nand stable company",
    "A trustworthy company" = "A trustworthy company",
    "Company’s lifetime on the market" = "Company’s lifetime \non the market",
    "Ease and speed of processing credit products" = "Ease and speed of \nprocessing credit products",
    "Offers all common credit services" = "Offers all common \ncredit services",
    "Service quality" = "Service quality"
  )) +
  theme_classic() +
  theme(legend.position = "bottom") +
  xlab("") +
  ylab("Средняя оценка")

# On which attributes do young respondents differ? Run a t-test per attribute.
# data1_1 = respondents aged 18-30, data1_2 = all other respondents
data1_1 <- filter(data1, Sample == "Респонденты от 18 до 30")
data1_2 <- filter(data1, Sample == "Другие респонденты")
# A serious and stable company - no difference
t.test(data1_1$`A serious and stable company`, data1_2$`A serious and stable company`)

## 
##  Welch Two Sample t-test
## 
## data:  data1_1$`A serious and stable company` and data1_2$`A serious and stable company`
## t = 1.425, df = 195.46, p-value = 0.1557
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.0591878  0.3674882
## sample estimates:
## mean of x mean of y 
##  4.608696  4.454545

# A progressive and modern company - no difference
t.test(data1_1$`A progressive and modern company`, data1_2$`A progressive and modern company`)

## 
##  Welch Two Sample t-test
## 
## data:  data1_1$`A progressive and modern company` and data1_2$`A progressive and modern company`
## t = -1.0365, df = 157.4, p-value = 0.3016
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4517286  0.1407932
## sample estimates:
## mean of x mean of y 
##  3.836957  3.992424

# Service quality - significant at the 10% level; the 18-30 group rates it higher
t.test(data1_1$`Service quality`, data1_2$`Service quality`)

## 
##  Welch Two Sample t-test
## 
## data:  data1_1$`Service quality` and data1_2$`Service quality`
## t = 1.9495, df = 215.94, p-value = 0.05254
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.002064347  0.375582134
## sample estimates:
## mean of x mean of y 
##  4.641304  4.454545

# Offers all common credit services - significant at the 1% and 5% levels; the 18-30 group rates it higher
t.test(data1_1$`Offers all common credit services`, data1_2$`Offers all common credit services`)

## 
##  Welch Two Sample t-test
## 
## data:  data1_1$`Offers all common credit services` and data1_2$`Offers all common credit services`
## t = 3.3024, df = 221.89, p-value = 0.001117
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1248518 0.4943841
## sample estimates:
## mean of x mean of y 
##  4.521739  4.212121

# Ease and speed of processing  credit products - no difference
t.test(data1_1$`Ease and speed of processing  credit products`, data1_2$`Ease and speed of processing  credit products`)

## 
##  Welch Two Sample t-test
## 
## data:  data1_1$`Ease and speed of processing  credit products` and data1_2$`Ease and speed of processing  credit products`
## t = 0.94677, df = 203.38, p-value = 0.3449
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.09199567  0.26195615
## sample estimates:
## mean of x mean of y 
##  4.380435  4.295455

# Company’s lifetime on the market - significant at the 5% level; the 18-30 group rates it lower
t.test(data1_1$`Company’s lifetime on the market`, data1_2$`Company’s lifetime on the market`)

## 
##  Welch Two Sample t-test
## 
## data:  data1_1$`Company’s lifetime on the market` and data1_2$`Company’s lifetime on the market`
## t = -2.2819, df = 175.16, p-value = 0.0237
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.65972102 -0.04778886
## sample estimates:
## mean of x mean of y 
##  3.782609  4.136364

# A trustworthy company - not significant
t.test(data1_1$`A trustworthy company`, data1_2$`A trustworthy company`)

## 
##  Welch Two Sample t-test
## 
## data:  data1_1$`A trustworthy company` and data1_2$`A trustworthy company`
## t = -0.20441, df = 219.64, p-value = 0.8382
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1997907  0.1622413
## sample estimates:
## mean of x mean of y 
##  4.663043  4.681818

# Do respondents aged 18-30 differ from the other ages in the variability of their ratings? (checked with var.test)
# A serious and stable company - no difference
var.test(data1_1$`A serious and stable company`, data1_2$`A serious and stable company`)

## 
##  F test to compare two variances
## 
## data:  data1_1$`A serious and stable company` and data1_2$`A serious and stable company`
## F = 1.0078, num df = 91, denom df = 131, p-value = 0.9589
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6935081 1.4838514
## sample estimates:
## ratio of variances 
##           1.007761

# A progressive and modern company - significant at the 1% level; variance is higher in the 18-30 group
# NOTE(review): the two ratings of 9 for this attribute both appear in the 18-30
# by-bank table; if 9 is a non-answer code the variance gap may be an artifact - confirm.
var.test(data1_1$`A progressive and modern company`, data1_2$`A progressive and modern company`)

## 
##  F test to compare two variances
## 
## data:  data1_1$`A progressive and modern company` and data1_2$`A progressive and modern company`
## F = 1.8373, num df = 91, denom df = 131, p-value = 0.001421
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  1.264376 2.705299
## sample estimates:
## ratio of variances 
##            1.83731

# Service quality - significant at the 1% level; variance is lower in the 18-30 group
var.test(data1_1$`Service quality`, data1_2$`Service quality`)

## 
##  F test to compare two variances
## 
## data:  data1_1$`Service quality` and data1_2$`Service quality`
## F = 0.3394, num df = 91, denom df = 131, p-value = 1.226e-07
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.2335671 0.4997473
## sample estimates:
## ratio of variances 
##          0.3394045

# Offers all common credit services - significant at the 1% level; variance is lower in the 18-30 group
var.test(data1_1$`Offers all common credit services`, data1_2$`Offers all common credit services`)

## 
##  F test to compare two variances
## 
## data:  data1_1$`Offers all common credit services` and data1_2$`Offers all common credit services`
## F = 0.50606, num df = 91, denom df = 131, p-value = 0.0006512
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3482526 0.7451320
## sample estimates:
## ratio of variances 
##          0.5060581

# Ease and speed of processing  credit products - no difference
var.test(data1_1$`Ease and speed of processing  credit products`, data1_2$`Ease and speed of processing  credit products`)

## 
##  F test to compare two variances
## 
## data:  data1_1$`Ease and speed of processing  credit products` and data1_2$`Ease and speed of processing  credit products`
## F = 0.88243, num df = 91, denom df = 131, p-value = 0.5271
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6072609 1.2993141
## sample estimates:
## ratio of variances 
##          0.8824322

# Company’s lifetime on the market - significant at the 10% level; variance is higher in the 18-30 group
var.test(data1_1$`Company’s lifetime on the market`, data1_2$`Company’s lifetime on the market`)

## 
##  F test to compare two variances
## 
## data:  data1_1$`Company’s lifetime on the market` and data1_2$`Company’s lifetime on the market`
## F = 1.3788, num df = 91, denom df = 131, p-value = 0.09205
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.9488208 2.0301263
## sample estimates:
## ratio of variances 
##           1.378765

# A trustworthy company - significant at the 1% level; variance is lower in the 18-30 group
# NOTE(review): the full-sample table lists two ratings of 9 for this attribute and
# the 18-30 table lists none, so both sit in the other group; if 9 is a
# non-answer code the variance gap may be an artifact - confirm.
var.test(data1_1$`A trustworthy company`, data1_2$`A trustworthy company`)

## 
##  F test to compare two variances
## 
## data:  data1_1$`A trustworthy company` and data1_2$`A trustworthy company`
## F = 0.59596, num df = 91, denom df = 131, p-value = 0.009057
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.4101186 0.8775024
## sample estimates:
## ratio of variances 
##          0.5959578

# Summary: the mean ratings of the 18-30 group differ on Service quality (10% level), Offers all common credit services, and Company’s lifetime on the market

# Clustering with a self-organising map (SOM)
data7 <- select(data4, -`Main bank`, -Sample)
data7_scale <- data7 %>%
  scale() %>%
  as.matrix()
set.seed(123)
som_grid <- somgrid(xdim = 4, ydim = 4, topo = "hexagonal")
som_model <- som(data7_scale, grid = som_grid, rlen = 100, keep.data = TRUE)

# Grid position of each respondent's winning unit, derived from the unit index
# (assumes units are numbered row by row, left to right - TODO confirm)
from.bottom <- ceiling(som_model$unit.classif / som_model$grid$xdim)
from.left <- (som_model$unit.classif - 1) %% som_model$grid$xdim + 1

# One row per respondent: winning unit plus its row and column on the grid
cluster_details <- cbind(
  som.unit = som_model$unit.classif,
  from.bottom = from.bottom,
  from.left = from.left
)

# Diagnostic plots of the trained map: counts per unit, mapping of respondents, codebook vectors
plot(som_model, type = "counts") # looks fine, no need to coarsen the grid

plot(som_model, type = "mapping")

plot(som_model, type = "codes", main = "Эталонные представители узла")

## Warning in par(opar): argument 1 does not name a graphical parameter

# What do the clusters look like?
# The plot type is spelled out in full ("codes") instead of relying on partial
# matching of the abbreviated "code".
plot(som_model, type = "codes")

## Warning in par(opar): argument 1 does not name a graphical parameter

# Extract the codebook vector of every SOM unit
Som_codes <- as.data.frame(getCodes(som_model))

# How is each input variable distributed over the map?
# Som_codes has one column per variable in data7 (Gender, Age and the seven
# ratings), so the panel grid and the loop are sized from ncol() instead of a
# hard-coded 8, which left the last variable without a panel.
n_props <- ncol(Som_codes)
n_panel_cols <- ceiling(sqrt(n_props))
n_panel_rows <- ceiling(n_props / n_panel_cols)
par(mfrow = c(n_panel_rows, n_panel_cols))
for (i in seq_len(n_props)) {
  # plot1, plot2, ... are still assigned as before in case later code refers to them
  assign(
    paste0("plot", i),
    plot(som_model, type = "property", property = Som_codes[, i], main = colnames(Som_codes)[i])
  )
}

# Back to a single panel with the default margins
par(mfrow = c(1, 1))
par(mar = c(5.1, 4.1, 4.1, 2.1))

# Add hierarchical clustering on top of the SOM: cluster the codebook vectors
mydata <- as.matrix(Som_codes)

# Cut the dendrogram into 4 clusters of units
som_cluster <- cutree(hclust(dist(mydata)), k = 4)

# Colour palette: returns `n` colours produced by inferno()
palette_new_2 <- function(n) inferno(n = n)

# Show the unit clusters in different background colours together with the variables.
# The palette is sized from the number of clusters: a 3-colour palette indexed by
# 4 cluster ids gave NA for the fourth cluster, which was drawn without a fill.
# NOTE(review): the hand-written cluster sizes further down name the colours of
# the earlier 3-colour rendering; re-check them against the new plot.
plot(som_model, type = "codes",
     bgcol = palette_new_2(max(som_cluster))[som_cluster], main = NA)

## Warning in par(opar): argument 1 does not name a graphical parameter

# Outline the clusters stored in som_cluster on the current SOM plot
add.cluster.boundaries(som_model, som_cluster)

# Print, for every respondent, the winning unit and its position on the grid
cluster_details

##       som.unit from.bottom from.left
##  [1,]        3           1         3
##  [2,]       13           4         1
##  [3,]        2           1         2
##  [4,]       14           4         2
##  [5,]        1           1         1
##  [6,]       10           3         2
##  [7,]       16           4         4
##  [8,]        3           1         3
##  [9,]        8           2         4
## [10,]       12           3         4
## [11,]        9           3         1
## [12,]       13           4         1
## [13,]       13           4         1
## [14,]        6           2         2
## [15,]        7           2         3
## [16,]        3           1         3
## [17,]        9           3         1
## [18,]        2           1         2
## [19,]        3           1         3
## [20,]       12           3         4
## [21,]        3           1         3
## [22,]       12           3         4
## [23,]        7           2         3
## [24,]        5           2         1
## [25,]        8           2         4
## [26,]        7           2         3
## [27,]       15           4         3
## [28,]        1           1         1
## [29,]        4           1         4
## [30,]        9           3         1
## [31,]       12           3         4
## [32,]        8           2         4
## [33,]       15           4         3
## [34,]       13           4         1
## [35,]       11           3         3
## [36,]       10           3         2
## [37,]       11           3         3
## [38,]       10           3         2
## [39,]       13           4         1
## [40,]       13           4         1
## [41,]        1           1         1
## [42,]       16           4         4
## [43,]        3           1         3
## [44,]        8           2         4
## [45,]       12           3         4
## [46,]        9           3         1
## [47,]       13           4         1
## [48,]        4           1         4
## [49,]        9           3         1
## [50,]       12           3         4
## [51,]        8           2         4
## [52,]       15           4         3
## [53,]        7           2         3
## [54,]        5           2         1
## [55,]       13           4         1
## [56,]        2           1         2
## [57,]       14           4         2
## [58,]        1           1         1
## [59,]       10           3         2
## [60,]       16           4         4
## [61,]        4           1         4
## [62,]        9           3         1
## [63,]       12           3         4
## [64,]        8           2         4
## [65,]       15           4         3
## [66,]       13           4         1
## [67,]       11           3         3
## [68,]       10           3         2
## [69,]       11           3         3
## [70,]       10           3         2
## [71,]       13           4         1
## [72,]       13           4         1
## [73,]        1           1         1
## [74,]       16           4         4
## [75,]        3           1         3
## [76,]        8           2         4
## [77,]       12           3         4
## [78,]        9           3         1
## [79,]       13           4         1
## [80,]        4           1         4
## [81,]        9           3         1
## [82,]       12           3         4
## [83,]        8           2         4
## [84,]       15           4         3
## [85,]        7           2         3
## [86,]        5           2         1
## [87,]       13           4         1
## [88,]        2           1         2
## [89,]       14           4         2
## [90,]        1           1         1
## [91,]       10           3         2
## [92,]       16           4         4

# Number of respondents mapped to each SOM unit
table(cluster_details[, "som.unit"])

## 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 
##  6  4  7  4  3  1  5  8  8  7  4  9 13  3  5  5

# Red cluster: presumably the respondent counts of its units, copied by hand from the unit table above
6+4+3+1+7+4+3

## [1] 28

# White cluster:
7

## [1] 7

# Yellow cluster:
4+5+8+9+5+5

## [1] 36

# Black cluster:
8+13

## [1] 21

# Sum of the four cluster sizes computed above
28+7+36+21

## [1] 92

# Alternative solution with 5 clusters of units.
# The palette is sized from the number of clusters: a 4-colour palette indexed by
# 5 cluster ids gave NA for the fifth cluster, which was drawn without a fill.
som_cluster1 <- cutree(hclust(dist(mydata)), k = 5)
plot(som_model, type = "codes",
     bgcol = palette_new_2(max(som_cluster1))[som_cluster1], main = NA)

## Warning in par(opar): argument 1 does not name a graphical parameter

add.cluster.boundaries(som_model, som_cluster1)

# Was the model trained long enough? Inspect the training-progress plot.
# The margins are set to c(5.1, 4.1, 4.1, 2.1), which are R's default `mar` values.
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(som_model, type = "changes")

# Identify clusters within the sample of respondents aged 18-30.
# Drop the bank label and the sample label, then standardise the remaining columns.
data8 <- select(data4, -`Main bank`, -Sample)
data8_scale <- as.matrix(scale(data8))

# First attempt: train on the existing grid and look at the node counts.
som_model1 <- som(data8_scale, grid = som_grid, rlen = 100, keep.data = TRUE)
# Many overfitted nodes => shrink the grid.
plot(som_model1, type = "counts")

# Second attempt: a 4 x 3 hexagonal grid.
som_grid1 <- somgrid(xdim = 4, ydim = 3, topo = "hexagonal")
som_model2 <- som(data8_scale, grid = som_grid1, rlen = 100, keep.data = TRUE)
# This mapping looks fine.
plot(som_model2, type = "mapping")

plot(som_model2, type = "counts")

# Correlation matrix drawn as a graph.
# List the column names of data1 first.
colnames(data1)

##  [1] "Main bank"                                    
##  [2] "Gender"                                       
##  [3] "Age"                                          
##  [4] "A serious and stable company"                 
##  [5] "A progressive and modern company"             
##  [6] "Service quality"                              
##  [7] "Offers all common credit services"            
##  [8] "Ease and speed of processing  credit products"
##  [9] "Company’s lifetime on the market"             
## [10] "A trustworthy company"                        
## [11] "Sample"

# Correlation network of data1 with default settings. Columns 1 and 11
# ("Main bank" and "Sample") are left out before computing correlations.
data1[, -c(1, 11)] %>%
  as.matrix() %>%
  cor() %>%
  qgraph()

## Warning in abbreviate(colnames(input), 3): 'abbreviate' использована с не-ASCII
## символами

# Same correlation network with explicit `minimum`, `cut` and `vsize`
# (node size) settings.
data1[, -c(1, 11)] %>%
  as.matrix() %>%
  cor() %>%
  qgraph(minimum = 0.1, cut = 0.5, vsize = 10)

## Warning in abbreviate(colnames(input), 3): 'abbreviate' использована с не-ASCII
## символами

# Whole sample: themed correlation network with `minimum = 0.2`.
data1[, -c(1, 11)] %>%
  as.matrix() %>%
  cor() %>%
  qgraph(theme = "TeamFortress", minimum = 0.2)

## Warning in abbreviate(colnames(input), 3): 'abbreviate' использована с не-ASCII
## символами

# Target respondents: same network built from data4.
# NOTE(review): assumes columns 1 and 11 of data4 are the same non-numeric
# columns as in data1 — confirm.
data4[, -c(1, 11)] %>%
  as.matrix() %>%
  cor() %>%
  qgraph(theme = "TeamFortress", minimum = 0.2)

## Warning in abbreviate(colnames(input), 3): 'abbreviate' использована с не-ASCII
## символами

# Distribution of indicator scores, pooled and split by bank.

# Long form of data1: the indicator columns are stacked into an
# Indicator / Value pair of columns.
data1_4 <- data1 %>%
  pivot_longer(
    cols = "A serious and stable company":"A trustworthy company",
    names_to = "Indicator",
    values_to = "Value"
  )

# Whole sample, all banks pooled.
# The fill here is a fixed colour, not a mapped aesthetic, so a fill scale and
# a legend position would have no effect and are left out.
# NOTE(review): this y-axis label is in English while the by-bank plots use a
# Russian label — confirm which one is intended.
ggplot(data1_4, aes(x = Value)) +
  geom_density(fill = "darkblue", alpha = 0.5) +
  theme_classic() +
  xlab("") +
  ylab("Distribution density") +
  theme(
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11)
  ) +
  facet_wrap(~Indicator)

# Density of Value per indicator, one filled curve per main bank, legend at the
# bottom. `long_scores` must have the columns Value, `Main bank` and Indicator.
plot_density_by_bank <- function(long_scores) {
  ggplot(long_scores, aes(x = Value, fill = `Main bank`)) +
    geom_density(alpha = 0.5) +
    theme_classic() +
    xlab("") +
    ylab("Плотность распределения") +
    theme(
      axis.text.x = element_text(size = 11),
      axis.text.y = element_text(size = 11),
      legend.position = "bottom"
    ) +
    scale_fill_manual(
      values = c("red", "#C1DFF9", "#748CDB", "#163B88", "#192D45")
    ) +
    facet_wrap(~Indicator)
}

# Whole sample, split by bank.
plot_density_by_bank(data1_4)

# Target sample, split by bank.
data4_1 <- data4 %>%
  pivot_longer(
    cols = "A serious and stable company":"A trustworthy company",
    names_to = "Indicator",
    values_to = "Value"
  )
plot_density_by_bank(data4_1)

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

# Target-sample density plot split by bank; no legend position is set here, so
# the legend stays at its default position.
ggplot(data4_1, aes(x = Value, fill = `Main bank`)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~Indicator) +
  scale_fill_manual(
    values = c("red", "#C1DFF9", "#748CDB", "#163B88", "#192D45")
  ) +
  labs(x = "", y = "Плотность распределения") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11)
  )

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

## Warning in max(ids, na.rm = TRUE): у 'max' нет не пропущенных аргументов;
## возвращаю -Inf

# Scores by bank among respondents aged 18-30.
# Keep only that sample, stack the indicator columns into Indicator / Value,
# and drop "Bank Bramble".
# NOTE(review): the y axis is labelled as an average score, so data5 presumably
# already holds per-bank means — confirm where data5 is built.
data5_1 <- data5 %>%
  filter(Sample == "Респонденты от 18 до 30") %>%
  select(-Sample) %>%
  pivot_longer(
    cols = "A serious and stable company":"A trustworthy company",
    names_to = "Indicator",
    values_to = "Value"
  ) %>%
  filter(`Main bank` != "Bank Bramble")

# Dodged bar chart of Value per bank, one bar per indicator, with the rounded
# value printed as a label on each bar.
# `scores` needs the columns `Main bank`, Value and Indicator;
# `fill_colours` is the vector of fill colours passed to scale_fill_manual().
plot_bank_scores <- function(scores, fill_colours) {
  # Bars and labels share one dodge so every label sits on its own bar.
  # Dodging labels by a different width than the bars shifts them sideways.
  bar_dodge <- position_dodge(width = 0.9)

  ggplot(data = scores, aes(x = `Main bank`, y = Value, fill = Indicator)) +
    geom_col(position = bar_dodge) +
    xlab("") +
    ylab("Средняя оценка") +
    theme_classic() +
    theme(axis.text.x = element_text(size = 11), legend.position = "bottom") +
    geom_label(
      aes(label = round(Value, 2)),
      position = bar_dodge,
      color = "black"
    ) +
    scale_fill_manual(values = fill_colours)
}

# All indicators.
plot_bank_scores(
  data5_1,
  c("#163B88", "#748CDB", "#C1DFF9", "#7990A3", "#e9e7e8", "#6e9ec1", "#0d88bf")
)

# Only the four selected indicators.
data5_2 <- data5_1 %>%
  filter(
    Indicator %in% c(
      "Offers all common credit services",
      "Service quality",
      "A serious and stable company",
      "Ease and speed of processing  credit products"
    )
  )
plot_bank_scores(
  data5_2,
  c("#163B88", "#748CDB", "#C1DFF9", "#e9e7e8", "#6e9ec1", "#0d88bf")
)