# Record the run timestamp and the R session details for reproducibility.
# NOTE: the workspace is not wiped with rm(list = ls()) here. That call does
# not unload packages or reset options, and it would delete unrelated objects
# when this script is sourced into a working session. Run the analysis in a
# fresh R session instead.
date()
## [1] "Sun Dec 1 17:50:18 2024"
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-apple-darwin20
## Running under: macOS Ventura 13.7.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Moscow
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.36 R6_2.5.1 fastmap_1.2.0 xfun_0.46
## [5] cachem_1.1.0 knitr_1.48 htmltools_0.5.8.1 rmarkdown_2.27
## [9] lifecycle_1.0.4 cli_3.6.3 sass_0.4.9 jquerylib_0.1.4
## [13] compiler_4.4.1 rstudioapi_0.16.0 tools_4.4.1 evaluate_0.24.0
## [17] bslib_0.7.0 yaml_2.3.9 rlang_1.1.4 jsonlite_1.8.8
options(scipen = 999) # Suppress scientific notation in printed numbers
# Libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the saved workspace file; the rest of the script works on the `Data`
# data frame, which is expected to come from this file.
load("Data.RData")

# Summary statistics of the two outcome scores.
summary(Data[["WellBeing"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 258.0 303.0 326.0 328.7 343.0 447.0
summary(Data[["EnvIdentity"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.00 49.00 61.00 59.85 70.00 85.00

# Cross-tabulation of nationality by childhood place of residence.
table(Data[["Nationality"]], Data[["ResidenceChild"]])
##
## BigCity SmallTown Village SmallVillage
## France 22 16 14 10
## Russia 27 10 6 4

# Summary statistics of age, overall and then per nationality.
summary(Data[["Age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.00 22.00 24.00 24.81 27.00 34.00
Data |>
  group_by(Nationality) |>
  summarise(
    Mean_Age = mean(Age, na.rm = TRUE),
    SD_Age = sd(Age, na.rm = TRUE)
  )
## # A tibble: 2 × 3
## Nationality Mean_Age SD_Age
## <fct> <dbl> <dbl>
## 1 France 24.8 4.54
## 2 Russia 24.8 3.99
Создаю переменную ResidenceChild_3 с сокращённым числом групп «места жительства в детстве»: категории Village и SmallVillage объединяются в одну категорию Small.
# Derive ResidenceChild_3: a coarser version of ResidenceChild in which the
# "Village" and "SmallVillage" categories are merged into "Small".
Data$ResidenceChild_3 <- recode(
  Data$ResidenceChild,
  "Village" = "Small",
  "SmallVillage" = "Small"
)
# Put the new column right after the 4th column. relocate() moves only the
# named column, so re-running this step cannot drop other columns (a
# positional select() over 5:(ncol(Data) - 1) loses the last column when the
# new column already exists).
Data <- relocate(Data, ResidenceChild_3, .after = 4)
# Check the group sizes after merging.
table(Data$Nationality, Data$ResidenceChild_3)
##
## BigCity SmallTown Small
## France 22 16 24
## Russia 27 10 10
# Base-graphics boxplots of both scores by childhood place of residence:
# first with the original grouping variable, then with the merged one.
boxplot(
  WellBeing ~ ResidenceChild,
  data = Data,
  main = "Well-Being по месту жительства в детстве"
)
boxplot(
  EnvIdentity ~ ResidenceChild,
  data = Data,
  main = "Идентичность с природой по месту жительства в детстве"
)
boxplot(
  WellBeing ~ ResidenceChild_3,
  data = Data,
  main = "Well-Being по месту жительства в детстве"
)
boxplot(
  EnvIdentity ~ ResidenceChild_3,
  data = Data,
  main = "Идентичность с природой по месту жительства в детстве"
)
# Boxplots of each score by childhood place of residence, split by
# nationality (fill colour). Outliers are drawn as red open circles.

# WellBeing, original residence categories.
ggplot(Data, aes(x = ResidenceChild, y = WellBeing, fill = Nationality)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 1) +
  labs(
    title = "WellBeing по месту жительства и национальности",
    x = "Место жительства в детстве",
    y = "WellBeing"
  ) +
  scale_fill_manual(values = c("skyblue", "lightgreen")) +
  theme_minimal()

# EnvIdentity, original residence categories.
ggplot(Data, aes(x = ResidenceChild, y = EnvIdentity, fill = Nationality)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 1) +
  labs(
    title = "EnvIdentity по месту жительства и национальности",
    x = "Место жительства в детстве",
    y = "EnvIdentity"
  ) +
  scale_fill_manual(values = c("pink", "lightblue")) +
  theme_minimal()

# WellBeing, merged residence categories.
ggplot(Data, aes(x = ResidenceChild_3, y = WellBeing, fill = Nationality)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 1) +
  labs(
    title = "WellBeing по месту жительства и национальности",
    x = "Место жительства в детстве",
    y = "WellBeing"
  ) +
  scale_fill_manual(values = c("skyblue", "lightgreen")) +
  theme_minimal()

# EnvIdentity, merged residence categories.
ggplot(Data, aes(x = ResidenceChild_3, y = EnvIdentity, fill = Nationality)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 1) +
  labs(
    title = "EnvIdentity по месту жительства и национальности",
    x = "Место жительства в детстве",
    y = "EnvIdentity"
  ) +
  scale_fill_manual(values = c("pink", "lightblue")) +
  theme_minimal()
# Mean and standard deviation of both scores for every combination of
# nationality and childhood place of residence (original categories).
Data |>
  group_by(Nationality, ResidenceChild) |>
  summarise(
    Mean_WellBeing = mean(WellBeing, na.rm = TRUE),
    SD_WellBeing = sd(WellBeing, na.rm = TRUE),
    Mean_EnvIdentity = mean(EnvIdentity, na.rm = TRUE),
    SD_EnvIdentity = sd(EnvIdentity, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'Nationality'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 6
## # Groups: Nationality [2]
## Nationality ResidenceChild Mean_WellBeing SD_WellBeing Mean_EnvIdentity
## <fct> <ord> <dbl> <dbl> <dbl>
## 1 France BigCity 315. 25.5 57.2
## 2 France SmallTown 301. 26.2 54.9
## 3 France Village 313. 13.4 64.7
## 4 France SmallVillage 310. 22.2 66.6
## 5 Russia BigCity 354. 42.0 58.5
## 6 Russia SmallTown 366. 36.8 60.9
## 7 Russia Village 345. 39.5 57.7
## 8 Russia SmallVillage 322 23.6 69.8
## # ℹ 1 more variable: SD_EnvIdentity <dbl>
# Mean and standard deviation of both scores for every combination of
# nationality and childhood place of residence (merged categories).
Data |>
  group_by(Nationality, ResidenceChild_3) |>
  summarise(
    Mean_WellBeing = mean(WellBeing, na.rm = TRUE),
    SD_WellBeing = sd(WellBeing, na.rm = TRUE),
    Mean_EnvIdentity = mean(EnvIdentity, na.rm = TRUE),
    SD_EnvIdentity = sd(EnvIdentity, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'Nationality'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 6
## # Groups: Nationality [2]
## Nationality ResidenceChild_3 Mean_WellBeing SD_WellBeing Mean_EnvIdentity
## <fct> <ord> <dbl> <dbl> <dbl>
## 1 France BigCity 315. 25.5 57.2
## 2 France SmallTown 301. 26.2 54.9
## 3 France Small 312. 17.3 65.5
## 4 Russia BigCity 354. 42.0 58.5
## 5 Russia SmallTown 366. 36.8 60.9
## 6 Russia Small 336 34.6 62.5
## # ℹ 1 more variable: SD_EnvIdentity <dbl>
# Shapiro-Wilk normality tests for each score, run separately within each
# nationality group (WellBeing first, then EnvIdentity).
shapiro.test(Data$WellBeing[Data$Nationality == "France"])
##
## Shapiro-Wilk normality test
##
## data: Data$WellBeing[Data$Nationality == "France"]
## W = 0.98671, p-value = 0.7408
shapiro.test(Data$WellBeing[Data$Nationality == "Russia"])
##
## Shapiro-Wilk normality test
##
## data: Data$WellBeing[Data$Nationality == "Russia"]
## W = 0.9697, p-value = 0.258
shapiro.test(Data$EnvIdentity[Data$Nationality == "France"])
##
## Shapiro-Wilk normality test
##
## data: Data$EnvIdentity[Data$Nationality == "France"]
## W = 0.9758, p-value = 0.2582
shapiro.test(Data$EnvIdentity[Data$Nationality == "Russia"])
##
## Shapiro-Wilk normality test
##
## data: Data$EnvIdentity[Data$Nationality == "Russia"]
## W = 0.98051, p-value = 0.6136
По тесту Шапиро–Уилка распределения обеих переменных в обеих странах значимо не отличаются от нормального (все p > 0.05).
# Histograms of each score (bin width 10), one panel per nationality, as a
# visual companion to the normality tests. Each plot is drawn exactly once;
# the EnvIdentity histogram used to be repeated by a duplicated line.
ggplot(Data, aes(x = WellBeing)) +
  geom_histogram(binwidth = 10) +
  facet_wrap(~Nationality)
ggplot(Data, aes(x = EnvIdentity)) +
  geom_histogram(binwidth = 10) +
  facet_wrap(~Nationality)
# Check for equality of variances between the two nationality groups
# (Bartlett test), separately for each score.
bartlett.test(WellBeing ~ Nationality, data = Data)
##
## Bartlett test of homogeneity of variances
##
## data: WellBeing by Nationality
## Bartlett's K-squared = 15.588, df = 1, p-value = 0.00007876
bartlett.test(EnvIdentity ~ Nationality, data = Data)
##
## Bartlett test of homogeneity of variances
##
## data: EnvIdentity by Nationality
## Bartlett's K-squared = 0.027032, df = 1, p-value = 0.8694
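Благополучие: равенство дисперсий не соблюдается (p < 0.001), поэтому использую t-тест с флагом для неравных дисперсий (var.equal = FALSE, тест Уэлча). Идентичность: равенство дисперсий соблюдается (p = 0.87), поэтому использую t-тест для равных дисперсий (var.equal = TRUE).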
# WellBeing between nationalities: variances are not assumed equal
# (var.equal = FALSE), which gives the Welch test.
t.test(WellBeing ~ Nationality, data = Data, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: WellBeing by Nationality
## t = -6.5518, df = 68.958, p-value = 0.000000008566
## alternative hypothesis: true difference in means between group France and group Russia is not equal to 0
## 95 percent confidence interval:
## -55.81866 -29.76061
## sample estimates:
## mean in group France mean in group Russia
## 310.2742 353.0638
# EnvIdentity between nationalities: variances are assumed equal
# (var.equal = TRUE), which gives the pooled-variance test.
t.test(EnvIdentity ~ Nationality, data = Data, var.equal = TRUE)
##
## Two Sample t-test
##
## data: EnvIdentity by Nationality
## t = -0.013705, df = 107, p-value = 0.9891
## alternative hypothesis: true difference in means between group France and group Russia is not equal to 0
## 95 percent confidence interval:
## -4.898282 4.831020
## sample estimates:
## mean in group France mean in group Russia
## 59.83871 59.87234
Сравнение по отдельным группам при четырех местах жительства
# Wilcoxon rank-sum test of WellBeing between nationalities, run separately
# within each childhood-residence category (original categories). The
# per-category p-value and test statistic are stacked into one data frame.
# exact = FALSE is not passed, so R warns whenever ties prevent an exact
# p-value.
do.call(
  rbind,
  lapply(unique(Data$ResidenceChild), \(place) {
    in_category <- Data[Data$ResidenceChild == place, ]
    outcome <- wilcox.test(WellBeing ~ Nationality, data = in_category)
    data.frame(
      ResidenceChild = place,
      p_value = outcome$p.value,
      statistic = as.numeric(outcome$statistic) # plain number, name dropped
    )
  })
)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## ResidenceChild p_value statistic
## 1 BigCity 0.0008143039 130
## 2 SmallTown 0.0002016445 9
## 3 Village 0.0186078164 13
## 4 SmallVillage 0.3736263736 13
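Аргумент exact = FALSE предотвращает появление предупреждения. Само предупреждение лишь сообщает, что при совпадающих значениях (ties) точное p-значение вычислить нельзя и вместо него используется нормальная аппроксимация, поэтому для таких групп результат не меняется. Однако в малых группах без совпадений (здесь SmallVillage для WellBeing) exact = FALSE заменит точное p-значение приближённым, и оно может немного измениться.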
# Wilcoxon rank-sum test of EnvIdentity between nationalities, run separately
# within each childhood-residence category (original categories). The
# per-category p-value and test statistic are stacked into one data frame.
# exact = FALSE is not passed, so R warns whenever ties prevent an exact
# p-value.
do.call(
  rbind,
  lapply(unique(Data$ResidenceChild), \(place) {
    in_category <- Data[Data$ResidenceChild == place, ]
    outcome <- wilcox.test(EnvIdentity ~ Nationality, data = in_category)
    data.frame(
      ResidenceChild = place,
      p_value = outcome$p.value,
      statistic = as.numeric(outcome$statistic)
    )
  })
)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## ResidenceChild p_value statistic
## 1 BigCity 0.7022136 277.5
## 2 SmallTown 0.3836088 63.0
## 3 Village 0.2644301 56.0
## 4 SmallVillage 1.0000000 20.0
Сравнение по отдельным группам при трех местах жительства
# Wilcoxon rank-sum test of WellBeing between nationalities, run separately
# within each childhood-residence category (merged categories). The
# per-category p-value and test statistic are stacked into one data frame.
# exact = FALSE is not passed, so R warns whenever ties prevent an exact
# p-value.
do.call(
  rbind,
  lapply(unique(Data$ResidenceChild_3), \(place) {
    in_category <- Data[Data$ResidenceChild_3 == place, ]
    outcome <- wilcox.test(WellBeing ~ Nationality, data = in_category)
    data.frame(
      ResidenceChild_3 = place,
      p_value = outcome$p.value,
      statistic = as.numeric(outcome$statistic)
    )
  })
)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## ResidenceChild_3 p_value statistic
## 1 BigCity 0.0008143039 130
## 2 SmallTown 0.0002016445 9
## 3 Small 0.0163343073 56
# Wilcoxon rank-sum test of EnvIdentity between nationalities, run separately
# within each childhood-residence category (merged categories). The
# per-category p-value and test statistic are stacked into one data frame.
# exact = FALSE is not passed, so R warns whenever ties prevent an exact
# p-value.
do.call(
  rbind,
  lapply(unique(Data$ResidenceChild_3), \(place) {
    in_category <- Data[Data$ResidenceChild_3 == place, ]
    outcome <- wilcox.test(EnvIdentity ~ Nationality, data = in_category)
    data.frame(
      ResidenceChild_3 = place,
      p_value = outcome$p.value,
      statistic = as.numeric(outcome$statistic)
    )
  })
)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## ResidenceChild_3 p_value statistic
## 1 BigCity 0.7022136 277.5
## 2 SmallTown 0.3836088 63.0
## 3 Small 0.4156133 142.0
# Kruskal-Wallis tests of each score across the original childhood-residence
# categories: within France, within Russia, and on the whole sample.

# France
kruskal.test(
  WellBeing ~ ResidenceChild,
  data = subset(Data, Nationality == "France")
)
##
## Kruskal-Wallis rank sum test
##
## data: WellBeing by ResidenceChild
## Kruskal-Wallis chi-squared = 2.6401, df = 3, p-value = 0.4505
kruskal.test(
  EnvIdentity ~ ResidenceChild,
  data = subset(Data, Nationality == "France")
)
##
## Kruskal-Wallis rank sum test
##
## data: EnvIdentity by ResidenceChild
## Kruskal-Wallis chi-squared = 8.8257, df = 3, p-value = 0.0317
# Russia
kruskal.test(
  WellBeing ~ ResidenceChild,
  data = subset(Data, Nationality == "Russia")
)
##
## Kruskal-Wallis rank sum test
##
## data: WellBeing by ResidenceChild
## Kruskal-Wallis chi-squared = 4.5871, df = 3, p-value = 0.2047
kruskal.test(
  EnvIdentity ~ ResidenceChild,
  data = subset(Data, Nationality == "Russia")
)
##
## Kruskal-Wallis rank sum test
##
## data: EnvIdentity by ResidenceChild
## Kruskal-Wallis chi-squared = 4.0903, df = 3, p-value = 0.2519
# Whole sample (not split by country)
kruskal.test(WellBeing ~ ResidenceChild, data = Data)
##
## Kruskal-Wallis rank sum test
##
## data: WellBeing by ResidenceChild
## Kruskal-Wallis chi-squared = 5.0176, df = 3, p-value = 0.1705
kruskal.test(EnvIdentity ~ ResidenceChild, data = Data)
##
## Kruskal-Wallis rank sum test
##
## data: EnvIdentity by ResidenceChild
## Kruskal-Wallis chi-squared = 8.7986, df = 3, p-value = 0.03209
# Kruskal-Wallis tests of each score across the merged childhood-residence
# categories: within France, within Russia, and on the whole sample.

# France
kruskal.test(
  WellBeing ~ ResidenceChild_3,
  data = subset(Data, Nationality == "France")
)
##
## Kruskal-Wallis rank sum test
##
## data: WellBeing by ResidenceChild_3
## Kruskal-Wallis chi-squared = 2.2096, df = 2, p-value = 0.3313
kruskal.test(
  EnvIdentity ~ ResidenceChild_3,
  data = subset(Data, Nationality == "France")
)
##
## Kruskal-Wallis rank sum test
##
## data: EnvIdentity by ResidenceChild_3
## Kruskal-Wallis chi-squared = 8.6499, df = 2, p-value = 0.01323
# Russia
kruskal.test(
  WellBeing ~ ResidenceChild_3,
  data = subset(Data, Nationality == "Russia")
)
##
## Kruskal-Wallis rank sum test
##
## data: WellBeing by ResidenceChild_3
## Kruskal-Wallis chi-squared = 3.7001, df = 2, p-value = 0.1572
kruskal.test(
  EnvIdentity ~ ResidenceChild_3,
  data = subset(Data, Nationality == "Russia")
)
##
## Kruskal-Wallis rank sum test
##
## data: EnvIdentity by ResidenceChild_3
## Kruskal-Wallis chi-squared = 0.76418, df = 2, p-value = 0.6824
# Whole sample (not split by country)
kruskal.test(WellBeing ~ ResidenceChild_3, data = Data)
##
## Kruskal-Wallis rank sum test
##
## data: WellBeing by ResidenceChild_3
## Kruskal-Wallis chi-squared = 4.2913, df = 2, p-value = 0.117
kruskal.test(EnvIdentity ~ ResidenceChild_3, data = Data)
##
## Kruskal-Wallis rank sum test
##
## data: EnvIdentity by ResidenceChild_3
## Kruskal-Wallis chi-squared = 7.2376, df = 2, p-value = 0.02681