library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the CSV file
NLB_data <- read.csv("nlb_data.csv", sep = ";")
# Remove columns 2 to 8
NLB_data <- NLB_data %>% select(-c(2:8, 101:116))
#Remove first row
NLB_data <- NLB_data[-1, ]
#Remove all questionnaires with non valid status
library(dplyr)
NLB_data <- NLB_data %>%
filter(!status == 5)
#Remove status column
NLB_data <- NLB_data[ , -1]
#Remove all under 18/over 27
library(dplyr)
NLB_data <- NLB_data %>%
filter(!Q1 %in% c(1, 12))
# Factoring
# Age (Q1)
NLB_data$Q1F <- factor(NLB_data$Q1,
levels = c(2:11),
labels = c(18:27))
# Q5
NLB_data$Q5F <- factor(NLB_data$Q5,
levels = c(1, 2, 3, 4),
labels = c("Pay digital elsewhere", "Pay as available", "Never occurred", "Other"))
# Q7
NLB_data$Q7F <- factor(NLB_data$Q7,
levels = c(1, 2),
labels = c("Yes", "No"))
# Q17
NLB_data$Q17F <- factor(NLB_data$Q17,
levels = c(1, 2, 3, 4, 5),
labels = c("Cash", "Mobile apps", "Bank transfer", "Don't share", "Other"))
# Q20
NLB_data$Q20F <- factor(NLB_data$Q20,
levels = c(1, 2, 3, 4),
labels = c("Yes - me", "Yes - others", "Yes - both", "No"))
# Q22
NLB_data$Q22F <- factor(NLB_data$Q22,
levels = c(1, 2, 3, 4),
labels = c("Fully digital", "Balance digital-cash", "Cash", "Don't know"))
# Q23
NLB_data$Q23F <- factor(NLB_data$Q23,
levels = c(1, 2, 3, 4),
labels = c("Man", "Woman", "Other", "Don't want to answer"))
# Q24
NLB_data$Q24F <- factor(NLB_data$Q24,
levels = c(1, 2, 3, 4, 5, 6, 7),
labels = c("Unfinished primary", "Primary school", "Vocational education", "High School", "Bachelor Degree", "Master Degree", "PhD" ))
# Q25
NLB_data$Q25F <- factor(NLB_data$Q25,
levels = c(1, 2, 3, 4, 5),
labels = c("Student", "Employed", "Self-employed", "Unemployed", "Other"))
# Q26
NLB_data$Q26F <- factor(NLB_data$Q26,
levels = c(1, 2, 3, 4, 5),
labels = c("0-200 EUR", "201-500 EUR", "501-800 EUR", "801-1300 EUR", "Above 1300 EUR"))
# Q27
NLB_data$Q27F <- factor(NLB_data$Q27,
levels = c(1, 2, 3, 4, 5, 6, 7),
labels = c("NLB", "OTP", "Intesa Sanpaolo", "Sparkasse", "Addiko Bank", "Delovska Hranilnica", "Other"))
Let’s check if assumptions to perform a parametric are met:
n * 𝜋> 5 -> 304 * 0.5 = 152 > 5
n(1-𝜋) > 5 -> 304 * 0.5 = 152 > 5
Both assumptions are met, we can proceed with the parametric test - test of population proportion
H0: π = 0.5
H1: π > 0.5
sum(NLB_data$Q3a > 1, na.rm = TRUE)
## [1] 281
prop.test(x = 281,
n = 304,
p = 0.5,
correct = FALSE,
alternative = "greater")
##
## 1-sample proportions test without continuity correction
##
## data: 281 out of 304, null probability 0.5
## X-squared = 218.96, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
## 0.8954807 1.0000000
## sample estimates:
## p
## 0.9243421
We reject H0 at p<0.001, the proportion of young people who still use cash at least once a month is larger than 50%.
To test this, we will do a test of population proportion. Let’s check if assumptions are met.
n * 𝜋> 5 -> 304 * 0.5 = 152 > 5
n(1-𝜋) > 5 -> 304 * 0.5 = 152 > 5
Assumptions are met, so we can proceed with the test of population proportion.
H0: 𝜋 = 0.5
H1: 𝜋 > 0.5
sum(NLB_data$Q4a > 1, na.rm = TRUE)
## [1] 270
prop.test(x = 270,
n = 304,
p = 0.5,
correct = FALSE,
alternative = "greater")
##
## 1-sample proportions test without continuity correction
##
## data: 270 out of 304, null probability 0.5
## X-squared = 183.21, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
## 0.8549349 1.0000000
## sample estimates:
## p
## 0.8881579
We reject H0 at p<0.001. We have found that more than 50% use cash for their small (less than 10 EUR payments) at least some of the time.
Let’s do the same test, to test how many people use cash for less than half of their small payments.
H0: 𝜋 = 0.5
H1: 𝜋 > 0.5
sum(NLB_data$Q4a == 2, na.rm = TRUE)
## [1] 169
prop.test(x = 169,
n = 304,
p = 0.5,
correct = FALSE,
alternative = "greater")
##
## 1-sample proportions test without continuity correction
##
## data: 169 out of 304, null probability 0.5
## X-squared = 3.8026, df = 1, p-value = 0.02559
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
## 0.5087589 1.0000000
## sample estimates:
## p
## 0.5559211
We reject H0 at p = 0.026. We have found that the proportion of people, who use cash for their small payments is greater than 50%.
NLB_data_save <- NLB_data %>% select(20)
NLB_data_save$Q8a <-ifelse(test = NLB_data_save$Q8a == -2,
yes = NA,
no = NLB_data_save$Q8a)
library(tidyr)
NLB_data_save <- drop_na(NLB_data_save)
sum(NLB_data_save$Q8a < 4, na.rm = TRUE)
## [1] 34
NLB_data_save$Q8aF <- factor(NLB_data_save$Q8a,
levels = c(1, 2, 3, 4, 5, 6, 7),
labels = c("Fully in cash", "Up to 25% in cash", "25% and 50% in cash", "About 50% in cash, 50% digital", "25% to 50% digital", "Up to 25% digital", "Fully digital"))
library(ggplot2)
ggplot(NLB_data_save, aes(x = Q8aF)) +
geom_bar(fill = "darkblue", color = "black") +
labs(title = "How do you save?", x = "Preference", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
We have 233 people who save in our sample. Based on the frequency graph and the calculations, we have found that in our sample, only 34 (14.6%) people who save, save more than 50% in cash. Based on this, we have decided to reject our hypothesis, that young people who save, mostly do so in cash.
NLB_R4H4 <- NLB_data %>% select(85)
library(dplyr)
NLB_R4H4$Q22[NLB_R4H4$Q22 == -1] <- 4
NLB_R4H4$Q22F <- factor(NLB_R4H4$Q22,
levels = c(1, 2, 3, 4),
labels = c("Fully digital", "Balance digital-cash", "Cash", "Don't know"))
library(ggplot2)
ggplot(NLB_R4H4, aes(x = Q22F)) +
geom_bar(fill = "darkblue", color = "black") +
labs(title = "Would you switch to a fully digital society?", x = "Preference", y = "Count") +
theme_minimal()
From the graph, we can clearly see that the largest number of our respondents have answered that they would consider switching to a mostly digital society, but the do not want to fully give up cash.
library(ggplot2)
library(dplyr)
library(dplyr)
NLB_R5H5 <- NLB_data %>% select(c(65, 66))
head(NLB_R5H5)
## Q16a Q16b
## 1 5 2
## 2 4 4
## 3 4 4
## 4 7 7
## 5 4 4
## 6 3 5
str(NLB_R5H5)
## 'data.frame': 304 obs. of 2 variables:
## $ Q16a: chr "5" "4" "4" "7" ...
## $ Q16b: chr "2" "4" "4" "7" ...
NLB_R5H5 <- NLB_R5H5 %>% mutate_all(as.numeric)
str(NLB_R5H5)
## 'data.frame': 304 obs. of 2 variables:
## $ Q16a: num 5 4 4 7 4 3 1 3 1 1 ...
## $ Q16b: num 2 4 4 7 4 5 1 3 1 1 ...
summary(NLB_R5H5)
## Q16a Q16b
## Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000
## Median :3.000 Median :4.000
## Mean :3.125 Mean :3.339
## 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :7.000 Max. :7.000
NLB_R5H5$diffs <- NLB_R5H5$Q16a - NLB_R5H5$Q16b
shapiro_result <- shapiro.test(NLB_R5H5$diffs)
print(shapiro_result)
##
## Shapiro-Wilk normality test
##
## data: NLB_R5H5$diffs
## W = 0.80546, p-value < 2.2e-16
p-value < 0.001 → The p-value is extremely small, meaning the data does not follow a normal distribution.
Since normality is not met, we cannot use the paired t-test and must rely on the Wilcoxon signed-rank test instead.
wilcox.test(
NLB_R5H5[[1]],
NLB_R5H5[[2]],
paired = TRUE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided")
##
## Wilcoxon signed rank test
##
## data: NLB_R5H5[[1]] and NLB_R5H5[[2]]
## V = 2813.5, p-value = 0.04295
## alternative hypothesis: true location shift is not equal to 0
p-value = 0.043 → The p-value is less than 0.05, meaning we reject H0.
We conclude that there is a significant difference in how young people are influenced by family vs peers regarding payment habits.
A Wilcoxon signed-rank test was conducted to compare the influence of peers and family on young people’s payment habits. The results showed a statistically significant difference (p = 0.043), indicating there is a difference that young people are more influenced by their family than by their peers when choosing payment methods.
library(dplyr)
NLB_R6H6 <- NLB_data %>% select(c(29, 32))
head(NLB_R6H6)
## Q31_2a Q31_2d
## 1 7 6
## 2 6 7
## 3 7 6
## 4 7 7
## 5 7 7
## 6 5 7
str(NLB_R6H6)
## 'data.frame': 304 obs. of 2 variables:
## $ Q31_2a: chr "7" "6" "7" "7" ...
## $ Q31_2d: chr "6" "7" "6" "7" ...
NLB_R6H6 <- NLB_R6H6 %>% mutate_all(as.numeric)
str(NLB_R6H6)
## 'data.frame': 304 obs. of 2 variables:
## $ Q31_2a: num 7 6 7 7 7 5 6 6 4 7 ...
## $ Q31_2d: num 6 7 6 7 7 7 5 7 7 7 ...
summary(NLB_R6H6)
## Q31_2a Q31_2d
## Min. :1.000 Min. :1.000
## 1st Qu.:6.000 1st Qu.:5.000
## Median :7.000 Median :7.000
## Mean :6.105 Mean :6.076
## 3rd Qu.:7.000 3rd Qu.:7.000
## Max. :7.000 Max. :7.000
NLB_R6H6 <- na.omit(NLB_R6H6)
# Calculate the mean of column 29 (Enostavnost)
mean_enostavnost <- mean(NLB_R6H6[[1]], na.rm = TRUE)
print(mean_enostavnost)
## [1] 6.105263
# Calculate the mean of column 29 (Varnost)
mean_varnost <- mean(NLB_R6H6[[2]], na.rm = TRUE)
print(mean_varnost)
## [1] 6.075658
NLB_R6H6$diffs <- NLB_R6H6$Q31_2a - NLB_R6H6$Q31_2d
shapiro_result <- shapiro.test(NLB_R6H6$diffs)
print(shapiro_result)
##
## Shapiro-Wilk normality test
##
## data: NLB_R6H6$diffs
## W = 0.8771, p-value = 6.971e-15
Since the p-value is much smaller than 0.05, we reject the null hypothesis that the differences are normally distributed.
This means the data does not follow a normal distribution, which justifies using a non-parametric test like the Wilcoxon Signed-Rank test instead of a paired t-test.
wilcox.test(
NLB_R6H6[[1]],
NLB_R6H6[[2]],
paired = TRUE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided")
##
## Wilcoxon signed rank test
##
## data: NLB_R6H6[[1]] and NLB_R6H6[[2]]
## V = 5908, p-value = 0.537
## alternative hypothesis: true location shift is not equal to 0
p-value = 0.537
Since the p-value is greater than 0.05, we fail to reject the null hypothesis.
This indicates that there is no statistically significant difference between the two variables (Enostavnost and Varnost).
In the context of our hypothesis, this means that we do not have enough evidence to say that convenience is a significantly stronger factor than security in influencing digital payment adoption among young people.
The Wilcoxon test results indicate that there is no statistically significant difference between the perceived importance of convenience and security (p = 0.537). This means that we fail to reject the null hypothesis, suggesting that convenience is not significantly more important than security in influencing digital payment adoption among young people (18-27 years old).
Therefore, our data does not support the hypothesis that convenience is a more important factor than security in digital payment adoption. Both factors appear to have similar levels of importance for young users.
We are testing whether control over spending is the most significant factor influencing cash usage among young people.
library(dplyr)
NLB_R7H7 <- NLB_data %>% select(c(29:34))
NLB_R7H7$ID <- 1:nrow(NLB_R7H7)
head(NLB_R7H7)
## Q31_2a Q31_2b Q31_2c Q31_2d Q31_2e Q31_2f ID
## 1 7 7 6 6 3 6 1
## 2 6 7 5 7 7 6 2
## 3 7 6 6 6 4 5 3
## 4 7 7 7 7 7 7 4
## 5 7 7 7 7 7 7 5
## 6 5 3 7 7 4 4 6
str(NLB_R7H7)
## 'data.frame': 304 obs. of 7 variables:
## $ Q31_2a: chr "7" "6" "7" "7" ...
## $ Q31_2b: chr "7" "7" "6" "7" ...
## $ Q31_2c: chr "6" "5" "6" "7" ...
## $ Q31_2d: chr "6" "7" "6" "7" ...
## $ Q31_2e: chr "3" "7" "4" "7" ...
## $ Q31_2f: chr "6" "6" "5" "7" ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
NLB_R7H7 <- as.data.frame(NLB_R7H7)
NLB_R7H7 <- NLB_R7H7 %>%
mutate(
Q31_2a = as.numeric(Q31_2a),
Q31_2b = as.numeric(Q31_2b),
Q31_2c = as.numeric(Q31_2c),
Q31_2d = as.numeric(Q31_2d),
Q31_2e = as.numeric(Q31_2e),
Q31_2f = as.numeric(Q31_2f)
)
colnames(NLB_R7H7) <- c("Ease of Use", "Transaction Speed", "Usability in Stores","Payment Method", "Budget Tracking", "Privacy of Use", "ID")
str(NLB_R7H7)
## 'data.frame': 304 obs. of 7 variables:
## $ Ease of Use : num 7 6 7 7 7 5 6 6 4 7 ...
## $ Transaction Speed : num 7 7 6 7 7 3 6 7 4 5 ...
## $ Usability in Stores: num 6 5 6 7 7 7 6 7 7 7 ...
## $ Payment Method : num 6 7 6 7 7 7 5 7 7 7 ...
## $ Budget Tracking : num 3 7 4 7 7 4 5 6 1 4 ...
## $ Privacy of Use : num 6 6 5 7 7 4 4 7 1 5 ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
library(dplyr)
library(tidyr)
mydata_long <- NLB_R7H7 %>%
pivot_longer(
cols = c("Ease of Use", "Transaction Speed", "Usability in Stores","Payment Method", "Budget Tracking", "Privacy of Use"),
names_to = "System",
values_to = "Result"
) %>%
mutate(System = as.factor(System))
str(mydata_long)
## tibble [1,824 × 3] (S3: tbl_df/tbl/data.frame)
## $ ID : int [1:1824] 1 1 1 1 1 1 2 2 2 2 ...
## $ System: Factor w/ 6 levels "Budget Tracking",..: 2 5 6 3 1 4 2 5 6 3 ...
## $ Result: num [1:1824] 7 7 6 6 3 6 6 7 5 7 ...
head(mydata_long)
## # A tibble: 6 × 3
## ID System Result
## <int> <fct> <dbl>
## 1 1 Ease of Use 7
## 2 1 Transaction Speed 7
## 3 1 Usability in Stores 6
## 4 1 Payment Method 6
## 5 1 Budget Tracking 3
## 6 1 Privacy of Use 6
mydata_long <- mydata_long %>%
mutate(
Result = as.numeric(Result))
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.4.2
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
mydata_long %>%
group_by(System) %>%
shapiro_test(Result)
## # A tibble: 6 × 4
## System variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Budget Tracking Result 0.869 2.23e-15
## 2 Ease of Use Result 0.647 1.26e-24
## 3 Payment Method Result 0.720 2.92e-22
## 4 Privacy of Use Result 0.803 6.64e-19
## 5 Transaction Speed Result 0.685 1.98e-23
## 6 Usability in Stores Result 0.678 1.12e-23
H₀: The data follows a normal distribution.
H₁: The data does not follow a normal distribution.
The p-values for all groups are extremely low (p < 0,001). This means we reject the null hypothesis (H₀) and conclude that the data is not normally distributed.
library(rstatix)
FriedmanANOVA <- friedman_test(Result ~ System | ID,
data = mydata_long)
FriedmanANOVA
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <dbl> <dbl> <chr>
## 1 Result 304 151. 5 8.49e-31 Friedman test
H₀: There is no significant difference between the related groups; they come from the same distribution.
H₁: There is a significant difference between at least one pair of related groups.
Since the p-value is extremely low, we reject the H₀ and conclude that there is a significant difference in how young people perceive different motivations for using cash.
library(rstatix)
paires_nonpar <- wilcox_test(Result ~ System,
paired = TRUE,
p.adjust.method = "bonferroni",
data = mydata_long)
paires_nonpar
## # A tibble: 15 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Result Budget Tr… Ease … 304 304 3267 1.43e-13 2.14e-12 ****
## 2 Result Budget Tr… Payme… 304 304 2101 2.45e-16 3.68e-15 ****
## 3 Result Budget Tr… Priva… 304 304 3862. 2.08e- 6 3.12e- 5 ****
## 4 Result Budget Tr… Trans… 304 304 3730. 5.37e-12 8.06e-11 ****
## 5 Result Budget Tr… Usabi… 304 304 2379 2.35e-16 3.52e-15 ****
## 6 Result Ease of U… Payme… 304 304 5908 5.38e- 1 1 e+ 0 ns
## 7 Result Ease of U… Priva… 304 304 10024. 3.16e- 5 4.74e- 4 ***
## 8 Result Ease of U… Trans… 304 304 2077 1.22e- 1 1 e+ 0 ns
## 9 Result Ease of U… Usabi… 304 304 3312 6.99e- 1 1 e+ 0 ns
## 10 Result Payment M… Priva… 304 304 5870. 1.22e- 7 1.83e- 6 ****
## 11 Result Payment M… Trans… 304 304 6852 5.71e- 1 1 e+ 0 ns
## 12 Result Payment M… Usabi… 304 304 4026. 2.08e- 1 1 e+ 0 ns
## 13 Result Privacy o… Trans… 304 304 5279 5.72e- 4 9 e- 3 **
## 14 Result Privacy o… Usabi… 304 304 2981 1.37e- 7 2.05e- 6 ****
## 15 Result Transacti… Usabi… 304 304 3028. 5.9 e- 2 8.89e- 1 ns
library(rstatix)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.4.2
comparisons <- paires_nonpar %>%
add_y_position(fun = "median", step.increase = 0.35)
ggboxplot(mydata_long, x = "System", y = "Result", add = "point", ylim=c(0, 18)) +
stat_pvalue_manual(comparisons, hide.ns = FALSE) +
stat_summary(fun = median, geom = "point", shape = 16, size = 4,
aes(group = System), color = "darkred",
position = position_dodge(width = 0.8)) +
stat_summary(fun = median, colour = "darkred",
position = position_dodge(width = 0.8),
geom = "text", vjust = 0.5, hjust = -8,
aes(label = round(after_stat(y), digits = 2), group = System)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # Rotate names
labs(subtitle = get_test_label(FriedmanANOVA, detailed = TRUE),
caption = get_pwc_label(comparisons))
library(dplyr)
NLB_R8H8 <- NLB_data %>% select(c(69))
head(NLB_R8H8)
## Q18a
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
str(NLB_R8H8)
## 'data.frame': 304 obs. of 1 variable:
## $ Q18a: chr "1" "1" "1" "1" ...
col_name <- colnames(NLB_R8H8)[1]
NLB_R8H8$Q18aF <- factor(NLB_R8H8$Q18a,
levels = c(1, 0, -2),
labels = c("Transfer whole amounts", "Other reasons", "Not applicable"))
library(dplyr)
NLB_R8H8 %>% count (Q18aF)
## Q18aF n
## 1 Transfer whole amounts 236
## 2 Other reasons 28
## 3 Not applicable 40
To test this, we will do a test of population proportion. Let’s check if assumptions are met.
n * 𝜋> 5 -> 264 * 0.5 = 132 > 5
n(1-𝜋) > 5 -> 264 * 0.5 = 132 > 5
Assumptions are met, so we can proceed with the test of population proportion.
H0: 𝜋 = 0.5
H1: 𝜋 > 0.5
prop.test(x = 236,
n = 264,
p = 0.5,
correct = FALSE,
alternative = "greater")
##
## 1-sample proportions test without continuity correction
##
## data: 236 out of 264, null probability 0.5
## X-squared = 163.88, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
## 0.8586738 1.0000000
## sample estimates:
## p
## 0.8939394
Since p-value < 0.05, we reject the null hypothesis H₀. This means there is strong statistical evidence that the true population proportion is greater than 50%.