#Global options
knitr::opts_chunk$set(echo = TRUE,
message = FALSE,
warning = FALSE,
include = TRUE,
cache = FALSE)
knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
# auto format (kable)
options(kableExtra.auto_format = FALSE)
#round
options("scipen"=100, "digits"=4)
#get file
load(file="C:/Users/luisf/Dropbox/Puc-Rio/Projeto - Junia/Base R - Pesquisa mapfre.RData")
library(tidyverse) #environment
library(psych) #psychometric analysis
library(summarytools) #descriptive
Dataset <- janitor::clean_names(Dataset)
Dataset <- Dataset %>%
mutate(id = row_number())
plot_bdi <- Dataset %>%
select(starts_with("bdi")) %>%
pivot_longer(everything()) %>%
ggplot(., aes(x=name, y = value)) +
geom_boxplot()
plot_bai <- Dataset %>%
select(starts_with("bai")) %>%
pivot_longer(everything()) %>%
ggplot(., aes(x=name, y = value)) +
geom_boxplot()
gridExtra::grid.arrange(plot_bdi, plot_bai)
Dataset %>%
select(starts_with("bdi")) %>%
DataExplorer::plot_missing()
Dataset %>%
select(starts_with("bai")) %>%
DataExplorer::plot_missing()
Dataset %>%
select(id,country,starts_with("bdi")) %>% #select all items
mutate(number_na = rowSums(is.na(.))) %>% #
filter(number_na == Dataset %>% select(starts_with("bdi")) %>% ncol()) %>% #all missing
count(country, id)
## # A tibble: 8 x 3
## country id n
## <fct> <int> <int>
## 1 SPAIN 9 1
## 2 SPAIN 10 1
## 3 SPAIN 486 1
## 4 SPAIN 532 1
## 5 PORTUGAL 1217 1
## 6 PORTUGAL 1218 1
## 7 PORTUGAL 1441 1
## 8 BRAZIL 1795 1
Dataset %>%
select(id,country,starts_with("bai")) %>% #select all items
mutate(number_na = rowSums(is.na(.))) %>% #
filter(number_na == Dataset %>% select(starts_with("bai")) %>% ncol()) %>% #all missing
count(country, id)
## # A tibble: 5 x 3
## country id n
## <fct> <int> <int>
## 1 SPAIN 8 1
## 2 SPAIN 9 1
## 3 SPAIN 10 1
## 4 PORTUGAL 1217 1
## 5 PORTUGAL 1218 1
imputed_data_bdi <- Dataset %>%
select(starts_with("bdi"), -bdi_sum, -bdi_class) %>%
mice::mice(., m=5, maxit = 50, method = 'pmm', seed = 500)
summary(imputed_Data_bdi)
#get complete data ( 2nd out of 5)
complete_data_bdi <- mice::complete(imputed_data_bdi,2)
#check
DataExplorer::plot_missing(complete_data_bdi)
#add sums
complete_data_bdi <- complete_data_bdi %>%
mutate(bdi_sum = rowSums(select(., starts_with("bdi")), na.rm=T))
Comparing
comparable_results <- bind_rows(Dataset %>%
mutate(base = "original") %>%
select(base, bdi_sum),
complete_data_bdi %>%
mutate(base = "imputada") %>%
select(base, bdi_sum))
# Check correlation
comparable_results %>%
unstack(bdi_sum~base) %>%
ggplot(., aes(x = original, y = imputada)) +
geom_jitter() +
geom_smooth()
#check correlation
cor(Dataset$bdi_sum, complete_data_bdi$bdi_sum)
#check distributionn
ggplot(comparable_results, aes(x = base, y = bdi_sum)) +
geom_boxplot()
wilcox.test(bdi_sum ~ base, data = comparable_results, paired = TRUE)
Dataset <- Dataset %>%
mutate(bdi_sum = rowSums(select(., starts_with("bdi")), na.rm=T)) #adjust for all missing)
If the participant left blank all items, then do not use
#Dataset
Dataset <- Dataset %>% #get the dataset
mutate(number_na_bdi = rowSums(is.na(select(.,starts_with("bdi"))))) %>% #create a variable to count missing
mutate(bdi_sum = if_else(number_na_bdi == 21, NA_real_,bdi_sum)) #if all is missing
Dataset <- Dataset %>%
mutate(bdi_class = case_when(
bdi_sum <= 13 ~ "minima",
bdi_sum < 20 ~ "leve",
bdi_sum < 29 ~ "moderada",
bdi_sum >= 29 ~ "grave"))
Dataset <- Dataset %>%
mutate(bdi_class = factor(bdi_class,
levels=c("minima","leve","moderada","grave")))
Dataset <- Dataset %>%
mutate(bai_sum = rowSums(select(., starts_with("bai")), na.rm=T)) #adjust for all missing)
If the participant left blank all items, then do not use
#Dataset
Dataset <- Dataset %>% #get the dataset
mutate(number_na_bai = rowSums(is.na(select(.,starts_with("bai"))))) %>% #create a variable to count missing
mutate(bai_sum = if_else(number_na_bai == 21, NA_real_,bai_sum)) #if all is missing
Dataset <- Dataset %>%
mutate(bai_class = case_when(
bai_sum <= 10 ~ "minima",
bai_sum < 20 ~ "leve",
bai_sum < 31 ~ "moderada",
bai_sum >= 31 ~ "grave"))
Dataset <- Dataset %>%
mutate(bai_class = factor(bai_class,
levels=c("minima","leve","moderada","grave")))
Dataset %>%
select(starts_with("bdi"), -bdi_sum, -bdi_class) %>%
alpha()
##
## Reliability analysis
## Call: alpha(x = .)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.89 0.9 0.9 0.29 8.7 0.0034 0.44 0.37 0.29
##
## lower alpha upper 95% confidence boundaries
## 0.89 0.89 0.9
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## bdi_1 0.89 0.89 0.89 0.29 8.1 0.0036 0.0049 0.29
## bdi_2 0.89 0.89 0.90 0.30 8.4 0.0035 0.0050 0.30
## bdi_3 0.89 0.89 0.90 0.29 8.2 0.0035 0.0049 0.29
## bdi_4 0.89 0.89 0.90 0.29 8.2 0.0036 0.0050 0.29
## bdi_5 0.89 0.89 0.90 0.29 8.3 0.0035 0.0052 0.29
## bdi_6 0.89 0.89 0.90 0.30 8.5 0.0035 0.0051 0.30
## bdi_7 0.89 0.89 0.89 0.29 8.1 0.0036 0.0047 0.29
## bdi_8 0.89 0.89 0.90 0.29 8.3 0.0035 0.0052 0.29
## bdi_9 0.89 0.90 0.90 0.30 8.6 0.0034 0.0045 0.30
## bdi_10 0.89 0.89 0.90 0.30 8.5 0.0035 0.0051 0.30
## bdi_11 0.89 0.89 0.90 0.30 8.4 0.0035 0.0052 0.30
## bdi_12 0.89 0.89 0.90 0.29 8.2 0.0036 0.0051 0.29
## bdi_13 0.89 0.89 0.90 0.29 8.2 0.0036 0.0051 0.29
## bdi_14 0.89 0.89 0.89 0.29 8.1 0.0036 0.0047 0.29
## bdi_15 0.89 0.89 0.89 0.29 8.1 0.0036 0.0046 0.29
## bdi_16 0.89 0.89 0.90 0.30 8.5 0.0035 0.0048 0.30
## bdi_17 0.89 0.89 0.90 0.29 8.2 0.0036 0.0053 0.29
## bdi_18 0.89 0.89 0.90 0.30 8.4 0.0035 0.0052 0.29
## bdi_19 0.89 0.89 0.90 0.29 8.3 0.0035 0.0052 0.29
## bdi_20 0.89 0.89 0.89 0.29 8.2 0.0036 0.0048 0.29
## bdi_21 0.89 0.89 0.90 0.30 8.5 0.0035 0.0051 0.30
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## bdi_1 1946 0.64 0.66 0.65 0.60 0.248 0.53
## bdi_2 1947 0.52 0.52 0.48 0.45 0.543 0.69
## bdi_3 1949 0.58 0.60 0.57 0.53 0.298 0.57
## bdi_4 1947 0.62 0.62 0.60 0.56 0.375 0.59
## bdi_5 1946 0.56 0.56 0.53 0.50 0.488 0.61
## bdi_6 1946 0.48 0.50 0.46 0.43 0.162 0.51
## bdi_7 1945 0.66 0.65 0.64 0.60 0.386 0.70
## bdi_8 1949 0.59 0.58 0.55 0.52 0.790 0.72
## bdi_9 1947 0.40 0.44 0.40 0.36 0.072 0.32
## bdi_10 1947 0.52 0.51 0.47 0.45 0.342 0.69
## bdi_11 1941 0.54 0.53 0.49 0.47 0.544 0.69
## bdi_12 1943 0.60 0.61 0.58 0.54 0.433 0.62
## bdi_13 1943 0.61 0.60 0.57 0.54 0.464 0.79
## bdi_14 1941 0.64 0.64 0.62 0.59 0.307 0.67
## bdi_15 1939 0.66 0.65 0.64 0.60 0.565 0.69
## bdi_16 1933 0.53 0.51 0.48 0.45 0.855 0.78
## bdi_17 1938 0.60 0.60 0.57 0.54 0.434 0.64
## bdi_18 1942 0.55 0.54 0.50 0.48 0.535 0.71
## bdi_19 1942 0.59 0.57 0.54 0.52 0.756 0.80
## bdi_20 1942 0.63 0.62 0.60 0.57 0.527 0.69
## bdi_21 1939 0.48 0.51 0.47 0.44 0.136 0.45
##
## Non missing response frequency for each item
## 0 1 2 3 miss
## bdi_1 0.79 0.17 0.03 0.01 0.01
## bdi_2 0.55 0.38 0.05 0.02 0.01
## bdi_3 0.75 0.20 0.04 0.01 0.00
## bdi_4 0.68 0.28 0.04 0.01 0.01
## bdi_5 0.56 0.40 0.03 0.01 0.01
## bdi_6 0.88 0.10 0.01 0.02 0.01
## bdi_7 0.72 0.20 0.06 0.02 0.01
## bdi_8 0.37 0.50 0.11 0.02 0.00
## bdi_9 0.94 0.05 0.00 0.01 0.01
## bdi_10 0.77 0.14 0.07 0.02 0.01
## bdi_11 0.55 0.39 0.04 0.02 0.01
## bdi_12 0.63 0.33 0.03 0.01 0.01
## bdi_13 0.67 0.24 0.04 0.05 0.01
## bdi_14 0.80 0.10 0.09 0.01 0.01
## bdi_15 0.54 0.37 0.08 0.01 0.01
## bdi_16 0.35 0.49 0.12 0.04 0.01
## bdi_17 0.64 0.31 0.04 0.01 0.01
## bdi_18 0.57 0.34 0.06 0.02 0.01
## bdi_19 0.46 0.34 0.18 0.02 0.01
## bdi_20 0.57 0.35 0.06 0.02 0.01
## bdi_21 0.90 0.08 0.02 0.01 0.01
Dataset %>%
select(starts_with("bai"), -bai_sum, -bai_class) %>%
alpha()
##
## Reliability analysis
## Call: alpha(x = .)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.9 0.9 0.91 0.31 9.4 0.0032 0.41 0.39 0.31
##
## lower alpha upper 95% confidence boundaries
## 0.89 0.9 0.91
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## bai_1 0.90 0.9 0.91 0.31 9.1 0.0033 0.0081 0.31
## bai_2 0.90 0.9 0.91 0.32 9.2 0.0032 0.0077 0.31
## bai_3 0.90 0.9 0.91 0.31 9.0 0.0033 0.0082 0.31
## bai_4 0.89 0.9 0.91 0.31 8.8 0.0034 0.0074 0.30
## bai_5 0.89 0.9 0.90 0.30 8.7 0.0034 0.0071 0.30
## bai_6 0.90 0.9 0.91 0.31 9.0 0.0033 0.0080 0.30
## bai_7 0.89 0.9 0.91 0.31 8.8 0.0034 0.0075 0.30
## bai_8 0.89 0.9 0.91 0.31 8.8 0.0033 0.0078 0.30
## bai_9 0.90 0.9 0.91 0.31 8.9 0.0033 0.0075 0.31
## bai_10 0.89 0.9 0.90 0.30 8.7 0.0034 0.0074 0.30
## bai_11 0.89 0.9 0.90 0.30 8.8 0.0034 0.0074 0.30
## bai_12 0.89 0.9 0.90 0.31 8.8 0.0033 0.0077 0.30
## bai_13 0.90 0.9 0.90 0.30 8.8 0.0033 0.0074 0.30
## bai_14 0.90 0.9 0.91 0.31 8.9 0.0033 0.0079 0.31
## bai_15 0.90 0.9 0.91 0.31 9.0 0.0033 0.0076 0.30
## bai_16 0.90 0.9 0.91 0.32 9.4 0.0032 0.0068 0.31
## bai_17 0.89 0.9 0.90 0.31 8.8 0.0034 0.0076 0.30
## bai_18 0.90 0.9 0.91 0.31 9.1 0.0033 0.0082 0.31
## bai_19 0.90 0.9 0.91 0.31 9.0 0.0033 0.0081 0.31
## bai_20 0.90 0.9 0.91 0.32 9.4 0.0031 0.0070 0.31
## bai_21 0.90 0.9 0.91 0.31 9.1 0.0033 0.0081 0.31
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## bai_1 1944 0.53 0.53 0.49 0.47 0.41 0.63
## bai_2 1946 0.49 0.48 0.44 0.42 0.69 0.75
## bai_3 1940 0.55 0.56 0.53 0.50 0.34 0.63
## bai_4 1941 0.66 0.63 0.62 0.59 0.89 0.89
## bai_5 1947 0.70 0.68 0.67 0.64 0.69 0.90
## bai_6 1949 0.57 0.58 0.55 0.52 0.28 0.61
## bai_7 1948 0.65 0.65 0.63 0.60 0.34 0.68
## bai_8 1948 0.63 0.63 0.61 0.58 0.36 0.65
## bai_9 1942 0.60 0.61 0.59 0.55 0.20 0.53
## bai_10 1946 0.70 0.67 0.66 0.63 1.04 0.88
## bai_11 1944 0.65 0.66 0.65 0.61 0.22 0.59
## bai_12 1946 0.63 0.64 0.63 0.58 0.30 0.63
## bai_13 1942 0.63 0.65 0.64 0.59 0.17 0.47
## bai_14 1947 0.58 0.59 0.56 0.53 0.24 0.59
## bai_15 1945 0.57 0.59 0.57 0.52 0.22 0.57
## bai_16 1947 0.41 0.41 0.36 0.34 0.18 0.56
## bai_17 1940 0.65 0.65 0.63 0.60 0.39 0.69
## bai_18 1944 0.56 0.55 0.51 0.49 0.55 0.77
## bai_19 1939 0.53 0.56 0.53 0.49 0.14 0.44
## bai_20 1940 0.43 0.43 0.38 0.36 0.50 0.71
## bai_21 1944 0.53 0.53 0.50 0.47 0.37 0.66
##
## Non missing response frequency for each item
## 0 1 2 3 miss
## bai_1 0.67 0.26 0.07 0.00 0.01
## bai_2 0.47 0.38 0.14 0.01 0.01
## bai_3 0.74 0.19 0.07 0.01 0.01
## bai_4 0.41 0.35 0.19 0.05 0.01
## bai_5 0.56 0.24 0.15 0.05 0.01
## bai_6 0.80 0.13 0.06 0.01 0.00
## bai_7 0.76 0.16 0.07 0.01 0.00
## bai_8 0.73 0.20 0.06 0.01 0.00
## bai_9 0.85 0.10 0.04 0.01 0.01
## bai_10 0.30 0.42 0.21 0.07 0.01
## bai_11 0.85 0.09 0.05 0.01 0.01
## bai_12 0.78 0.15 0.06 0.01 0.01
## bai_13 0.87 0.10 0.03 0.00 0.01
## bai_14 0.83 0.12 0.04 0.01 0.01
## bai_15 0.85 0.10 0.04 0.01 0.01
## bai_16 0.89 0.06 0.03 0.02 0.01
## bai_17 0.71 0.20 0.07 0.01 0.01
## bai_18 0.60 0.27 0.10 0.02 0.01
## bai_19 0.89 0.08 0.03 0.00 0.01
## bai_20 0.61 0.29 0.09 0.01 0.01
## bai_21 0.72 0.21 0.06 0.01 0.01
mod_completo_bdi <- lm(bdi_sum ~ sex + country, Dataset)
apaTables::apa.aov.table(mod_completo_bdi)
##
##
## ANOVA results using bdi_sum as the dependent variable
##
##
## Predictor SS df MS F p partial_eta2
## (Intercept) 35232.64 1 35232.64 595.18 .000
## sex 528.14 1 528.14 8.92 .003 .00
## country 1179.55 2 589.77 9.96 .000 .01
## Error 114723.93 1938 59.20
## CI_90_partial_eta2
##
## [.00, .01]
## [.00, .02]
##
##
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
#normality
olsrr::ols_test_normality(mod_completo_bdi)
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.8825 0.0000
## Kolmogorov-Smirnov 0.1111 0.0000
## Cramer-von Mises 171.717 0.0000
## Anderson-Darling 47.4803 0.0000
## -----------------------------------------------
#equal variance
olsrr::ols_test_breusch_pagan(mod_completo_bdi)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -----------------------------------
## Response : bdi_sum
## Variables: fitted values of bdi_sum
##
## Test Summary
## -------------------------
## DF = 1
## Chi2 = 2.5017
## Prob > Chi2 = 0.1137
#all plots
lindia::gg_diagnose(mod_completo_bdi)
mod_completo_bai <- lm(bai_sum ~ sex + country, Dataset)
apaTables::apa.aov.table(mod_completo_bai)
##
##
## ANOVA results using bai_sum as the dependent variable
##
##
## Predictor SS df MS F p partial_eta2
## (Intercept) 23701.50 1 23701.50 372.46 .000
## sex 3248.35 1 3248.35 51.05 .000 .03
## country 252.51 2 126.25 1.98 .138 .00
## Error 123515.25 1941 63.63
## CI_90_partial_eta2
##
## [.02, .04]
## [.00, .01]
##
##
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
#normality
olsrr::ols_test_normality(mod_completo_bai)
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.8549 0.0000
## Kolmogorov-Smirnov 0.1427 0.0000
## Cramer-von Mises 186.2753 0.0000
## Anderson-Darling 72.9096 0.0000
## -----------------------------------------------
#equal variance
olsrr::ols_test_breusch_pagan(mod_completo_bai)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -----------------------------------
## Response : bai_sum
## Variables: fitted values of bai_sum
##
## Test Summary
## -------------------------------
## DF = 1
## Chi2 = 27.6053
## Prob > Chi2 = 0.0000001488
#all plots
lindia::gg_diagnose(mod_completo_bai)
Country
Dataset %>%
freq(country)
## Frequencies
## Dataset$country
## Type: Factor
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## -------------- ------ --------- -------------- --------- --------------
## SPAIN 1216 62.14 62.14 62.14 62.14
## PORTUGAL 426 21.77 83.90 21.77 83.90
## BRAZIL 315 16.10 100.00 16.10 100.00
## <NA> 0 0.00 100.00
## Total 1957 100.00 100.00 100.00 100.00
Dataset %>%
group_by(country) %>%
freq(sex)
## Frequencies
## Dataset$sex
## Type: Factor
## Group: country = SPAIN
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## M 384 31.76 31.76 31.58 31.58
## F 825 68.24 100.00 67.85 99.42
## <NA> 7 0.58 100.00
## Total 1216 100.00 100.00 100.00 100.00
##
## Group: country = PORTUGAL
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## M 203 47.65 47.65 47.65 47.65
## F 223 52.35 100.00 52.35 100.00
## <NA> 0 0.00 100.00
## Total 426 100.00 100.00 100.00 100.00
##
## Group: country = BRAZIL
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## M 149 47.30 47.30 47.30 47.30
## F 166 52.70 100.00 52.70 100.00
## <NA> 0 0.00 100.00
## Total 315 100.00 100.00 100.00 100.00
Dataset %>%
group_by(country) %>%
summarise(pvalue = chisq.test(table(sex))$p.value)
## # A tibble: 3 x 2
## country pvalue
## <fct> <dbl>
## 1 SPAIN 7.34e-37
## 2 PORTUGAL 3.33e- 1
## 3 BRAZIL 3.38e- 1
Age
Dataset %>%
group_by(country) %>%
descr(age)
## Descriptive Statistics
## age by country
## Data Frame: Dataset
## N: 1216
##
## SPAIN PORTUGAL BRAZIL
## ----------------- --------- ---------- --------
## Mean 21.49 20.43 22.83
## Std.Dev 3.02 1.66 7.18
## Min 18.00 18.00 17.00
## Q1 20.00 19.00 20.00
## Median 21.00 20.00 21.00
## Q3 23.00 22.00 23.00
## Max 48.00 23.00 68.00
## MAD 2.97 1.48 2.97
## IQR 3.00 3.00 3.00
## CV 0.14 0.08 0.31
## Skewness 2.59 0.02 3.65
## SE.Skewness 0.07 0.12 0.14
## Kurtosis 13.68 -1.27 15.34
## N.Valid 1210.00 426.00 293.00
## Pct.Valid 99.51 100.00 93.02
Dataset %>%
group_by(sex) %>%
descr(age)
## Descriptive Statistics
## age by sex
## Data Frame: Dataset
## N: 736
##
## M F NA
## ----------------- -------- --------- -------
## Mean 21.30 21.56 21.33
## Std.Dev 3.31 4.11 4.16
## Min 17.00 17.00 18.00
## Q1 19.00 19.00 18.00
## Median 21.00 21.00 20.00
## Q3 22.00 22.00 26.00
## Max 48.00 68.00 26.00
## MAD 1.48 2.97 2.97
## IQR 3.00 3.00 4.00
## CV 0.16 0.19 0.20
## Skewness 3.01 5.53 0.29
## SE.Skewness 0.09 0.07 1.22
## Kurtosis 14.69 45.70 -2.33
## N.Valid 720.00 1206.00 3.00
## Pct.Valid 97.83 99.34 42.86
Dataset %>%
group_by(sex, country) %>%
descr(age)
## Descriptive Statistics
## age by sex
## Data Frame: Dataset
## N: 384
##
## sex = M, country = SPAIN sex = M, country = PORTUGAL
## ----------------- -------------------------- -----------------------------
## Mean 21.43 20.51
## Std.Dev 3.48 1.64
## Min 18.00 18.00
## Q1 19.00 19.00
## Median 21.00 21.00
## Q3 22.00 22.00
## Max 48.00 23.00
## MAD 2.97 1.48
## IQR 3.00 3.00
## CV 0.16 0.08
## Skewness 2.85 -0.06
## SE.Skewness 0.12 0.17
## Kurtosis 14.11 -1.23
## N.Valid 384.00 203.00
## Pct.Valid 100.00 100.00
##
## Table: Table continues below
##
##
##
## sex = M, country = BRAZIL sex = F, country = SPAIN
## ----------------- --------------------------- --------------------------
## Mean 22.13 21.52
## Std.Dev 4.31 2.77
## Min 17.00 18.00
## Q1 20.00 20.00
## Median 21.00 21.00
## Q3 23.00 23.00
## Max 41.00 43.00
## MAD 1.48 2.97
## IQR 3.00 3.00
## CV 0.19 0.13
## Skewness 2.40 2.29
## SE.Skewness 0.21 0.09
## Kurtosis 6.45 11.63
## N.Valid 133.00 823.00
## Pct.Valid 89.26 99.76
##
## Table: Table continues below
##
##
##
## sex = F, country = PORTUGAL sex = F, country = BRAZIL
## ----------------- ----------------------------- ---------------------------
## Mean 20.37 23.41
## Std.Dev 1.68 8.87
## Min 18.00 17.00
## Q1 19.00 19.00
## Median 20.00 21.00
## Q3 22.00 23.00
## Max 23.00 68.00
## MAD 1.48 2.97
## IQR 3.00 4.00
## CV 0.08 0.38
## Skewness 0.09 3.18
## SE.Skewness 0.16 0.19
## Kurtosis -1.30 10.26
## N.Valid 223.00 160.00
## Pct.Valid 100.00 96.39
##
## Table: Table continues below
##
##
##
## sex = NA, country = SPAIN
## ----------------- ---------------------------
## Mean 21.33
## Std.Dev 4.16
## Min 18.00
## Q1 18.00
## Median 20.00
## Q3 26.00
## Max 26.00
## MAD 2.97
## IQR 4.00
## CV 0.20
## Skewness 0.29
## SE.Skewness 1.22
## Kurtosis -2.33
## N.Valid 3.00
## Pct.Valid 42.86
## Descriptive Statistics
## age by country
## Data Frame: Dataset
## N: 384
##
## sex = M, country = SPAIN sex = M, country = PORTUGAL
## ----------------- -------------------------- -----------------------------
## Mean 21.43 20.51
## Std.Dev 3.48 1.64
## Min 18.00 18.00
## Q1 19.00 19.00
## Median 21.00 21.00
## Q3 22.00 22.00
## Max 48.00 23.00
## MAD 2.97 1.48
## IQR 3.00 3.00
## CV 0.16 0.08
## Skewness 2.85 -0.06
## SE.Skewness 0.12 0.17
## Kurtosis 14.11 -1.23
## N.Valid 384.00 203.00
## Pct.Valid 100.00 100.00
##
## Table: Table continues below
##
##
##
## sex = M, country = BRAZIL sex = F, country = SPAIN
## ----------------- --------------------------- --------------------------
## Mean 22.13 21.52
## Std.Dev 4.31 2.77
## Min 17.00 18.00
## Q1 20.00 20.00
## Median 21.00 21.00
## Q3 23.00 23.00
## Max 41.00 43.00
## MAD 1.48 2.97
## IQR 3.00 3.00
## CV 0.19 0.13
## Skewness 2.40 2.29
## SE.Skewness 0.21 0.09
## Kurtosis 6.45 11.63
## N.Valid 133.00 823.00
## Pct.Valid 89.26 99.76
##
## Table: Table continues below
##
##
##
## sex = F, country = PORTUGAL sex = F, country = BRAZIL
## ----------------- ----------------------------- ---------------------------
## Mean 20.37 23.41
## Std.Dev 1.68 8.87
## Min 18.00 17.00
## Q1 19.00 19.00
## Median 20.00 21.00
## Q3 22.00 23.00
## Max 23.00 68.00
## MAD 1.48 2.97
## IQR 3.00 4.00
## CV 0.08 0.38
## Skewness 0.09 3.18
## SE.Skewness 0.16 0.19
## Kurtosis -1.30 10.26
## N.Valid 223.00 160.00
## Pct.Valid 100.00 96.39
##
## Table: Table continues below
##
##
##
## sex = NA, country = SPAIN
## ----------------- ---------------------------
## Mean 21.33
## Std.Dev 4.16
## Min 18.00
## Q1 18.00
## Median 20.00
## Q3 26.00
## Max 26.00
## MAD 2.97
## IQR 4.00
## CV 0.20
## Skewness 0.29
## SE.Skewness 1.22
## Kurtosis -2.33
## N.Valid 3.00
## Pct.Valid 42.86
Dataset %>%
group_by(country) %>%
nest() %>%
mutate(t_test = map(data, ~t.test(.$age ~ .$sex)$p.value)) %>%
unnest(t_test)
## # A tibble: 3 x 3
## # Groups: country [3]
## country data t_test
## <fct> <list<df[,93]>> <dbl>
## 1 SPAIN [1,216 × 93] 0.669
## 2 PORTUGAL [426 × 93] 0.387
## 3 BRAZIL [315 × 93] 0.109
This table was checked twice. Last checked on May 14 2020.
Dataset %>%
group_by(country) %>%
descr(bdi_sum)
## Descriptive Statistics
## bdi_sum by country
## Data Frame: Dataset
## N: 1216
##
## SPAIN PORTUGAL BRAZIL
## ----------------- --------- ---------- --------
## Mean 8.86 9.05 10.89
## Std.Dev 7.54 7.73 8.29
## Min 0.00 0.00 0.00
## Q1 3.00 4.00 5.00
## Median 7.00 7.00 9.50
## Q3 12.00 12.00 15.00
## Max 57.00 53.00 41.00
## MAD 5.93 5.93 8.15
## IQR 9.00 8.00 10.00
## CV 0.85 0.85 0.76
## Skewness 1.77 1.76 1.01
## SE.Skewness 0.07 0.12 0.14
## Kurtosis 5.12 5.11 0.80
## N.Valid 1212.00 423.00 314.00
## Pct.Valid 99.67 99.30 99.68
kruskal.test(bdi_sum ~ country, data = Dataset)
##
## Kruskal-Wallis rank sum test
##
## data: bdi_sum by country
## Kruskal-Wallis chi-squared = 18, df = 2, p-value = 0.0001
Dataset %>%
rstatix::kruskal_effsize(bdi_sum ~ country) #effect size
## # A tibble: 1 x 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 bdi_sum 1957 0.00802 eta2[H] small
PMCMR::posthoc.kruskal.nemenyi.test(x=Dataset$bdi_sum, g=Dataset$country, method="Tukey")
##
## Pairwise comparisons using Tukey and Kramer (Nemenyi) test
## with Tukey-Dist approximation for independent samples
##
## data: Dataset$bdi_sum and Dataset$country
##
## SPAIN PORTUGAL
## PORTUGAL 0.9351 -
## BRAZIL 0.000098 0.0031
##
## P value adjustment method: none
Dataset %>%
rstatix::dunn_test(bdi_sum ~ country, p.adjust.method = "bonferroni")
## # A tibble: 3 x 11
## .y. group1 group2 n1 n2 estimate statistic p method p.adj
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 bdi_… SPAIN PORTU… 1212 423 11.1 0.349 7.27e-1 Dunn … 1.00e+0
## 2 bdi_… SPAIN BRAZIL 1212 314 148. 4.16 3.22e-5 Dunn … 9.67e-5
## 3 bdi_… PORTU… BRAZIL 423 314 137. 3.27 1.08e-3 Dunn … 3.24e-3
## # … with 1 more variable: p.adj.signif <chr>
dunn.test::dunn.test(Dataset$bdi_sum,Dataset$country,method = "Bonferroni")
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 17.675, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | BRAZIL PORTUGAL
## ---------+----------------------
## PORTUGAL | 3.269105
## | 0.0016*
## |
## SPAIN | 4.157182 0.349355
## | 0.0000* 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha/2
Dataset %>%
group_by(country) %>%
filter(!is.na(bdi_class)) %>%
count(bdi_class) %>%
mutate(prop=prop.table(n)*100) %>%
select(-n) %>%
pivot_wider(names_from = c(bdi_class),
values_from = prop)
## # A tibble: 3 x 5
## # Groups: country [3]
## country minima leve moderada grave
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 SPAIN 80.4 11.4 5.69 2.56
## 2 PORTUGAL 79.0 11.8 6.38 2.84
## 3 BRAZIL 68.8 16.2 10.8 4.14
Individual chi square for each country
Dataset %>%
group_by(country) %>%
summarise(pvalue = chisq.test(table(bdi_class))$p.value)
## # A tibble: 3 x 2
## country pvalue
## <fct> <dbl>
## 1 SPAIN 0.
## 2 PORTUGAL 1.48e-143
## 3 BRAZIL 2.67e- 71
Dataset %>%
group_by(sex) %>%
descr(bdi_sum)
## Descriptive Statistics
## bdi_sum by sex
## Data Frame: Dataset
## N: 736
##
## M F NA
## ----------------- -------- --------- --------
## Mean 8.68 9.59 5.14
## Std.Dev 7.74 7.73 5.46
## Min 0.00 0.00 1.00
## Q1 3.00 4.00 2.00
## Median 7.00 8.00 3.00
## Q3 12.00 13.00 9.00
## Max 57.00 53.00 16.00
## MAD 5.93 7.41 1.48
## IQR 9.00 9.00 4.00
## CV 0.89 0.81 1.06
## Skewness 1.89 1.47 1.04
## SE.Skewness 0.09 0.07 0.79
## Kurtosis 5.63 3.29 -0.65
## N.Valid 732.00 1210.00 7.00
## Pct.Valid 99.46 99.67 100.00
wilcox.test(bdi_sum ~ sex, data = Dataset)
##
## Wilcoxon rank sum test with continuity correction
##
## data: bdi_sum by sex
## W = 404216, p-value = 0.001
## alternative hypothesis: true location shift is not equal to 0
Dataset %>%
filter(!is.na(sex)) %>%
{sjstats::mwu(.,bdi_sum, sex,distribution = "asymptotic")}
##
## # Mann-Whitney-U-Test
##
## Groups 1 = M (n = 732) | 2 = F (n = 1210):
## U = 672494.000, W = 404216.000, p = 0.001, Z = -3.231
## effect-size r = 0.073
## rank-mean(1) = 918.71
## rank-mean(2) = 1003.44
Dataset %>%
filter(!is.na(sex), !is.na(bdi_class)) %>% #<- filtering
group_by(sex) %>%
count(bdi_class) %>%
mutate(prop=prop.table(n)*100) %>%
select(-n) %>%
pivot_wider(names_from = c(bdi_class),
values_from = prop)
## # A tibble: 2 x 5
## # Groups: sex [2]
## sex minima leve moderada grave
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 M 81.0 10.7 5.05 3.28
## 2 F 76.4 13.2 7.69 2.64
Individual chi square for each sex
Dataset %>%
filter(!is.na(sex)) %>% #<- filtering
group_by(sex) %>%
summarise(pvalue = chisq.test(table(bai_class))$p.value)
## # A tibble: 2 x 2
## sex pvalue
## <fct> <dbl>
## 1 M 5.09e-258
## 2 F 1.51e-252
Dataset %>%
group_by(country) %>%
descr(bai_sum)
## Descriptive Statistics
## bai_sum by country
## Data Frame: Dataset
## N: 1216
##
## SPAIN PORTUGAL BRAZIL
## ----------------- --------- ---------- --------
## Mean 8.55 7.92 9.01
## Std.Dev 8.06 8.04 8.40
## Min 0.00 0.00 0.00
## Q1 3.00 2.00 3.00
## Median 6.00 5.00 6.00
## Q3 12.00 11.00 12.00
## Max 48.00 45.00 46.00
## MAD 5.93 5.93 5.93
## IQR 9.00 9.00 9.00
## CV 0.94 1.02 0.93
## Skewness 1.68 1.82 1.64
## SE.Skewness 0.07 0.12 0.14
## Kurtosis 3.38 3.87 3.31
## N.Valid 1213.00 424.00 315.00
## Pct.Valid 99.75 99.53 100.00
kruskal.test(bai_sum ~ country, data = Dataset)
##
## Kruskal-Wallis rank sum test
##
## data: bai_sum by country
## Kruskal-Wallis chi-squared = 5.8, df = 2, p-value = 0.06
Dataset %>%
rstatix::kruskal_effsize(bai_sum ~ country) #effect size
## # A tibble: 1 x 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 bai_sum 1957 0.00192 eta2[H] small
Dataset %>%
group_by(country) %>%
filter(!is.na(bai_class)) %>%
count(bai_class) %>%
mutate(prop=prop.table(n)*100) %>%
select(-n) %>%
pivot_wider(names_from = c(bai_class),
values_from = prop)
## # A tibble: 3 x 5
## # Groups: country [3]
## country minima leve moderada grave
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 SPAIN 71.2 19.0 7.17 2.64
## 2 PORTUGAL 72.6 18.4 5.90 3.07
## 3 BRAZIL 67.3 21.6 8.25 2.86
Individual chi square for each country
Dataset %>%
group_by(country) %>%
summarise(pvalue = chisq.test(table(bai_class))$p.value)
## # A tibble: 3 x 2
## country pvalue
## <fct> <dbl>
## 1 SPAIN 2.04e-314
## 2 PORTUGAL 8.19e-116
## 3 BRAZIL 6.20e- 70
Dataset %>%
group_by(sex) %>%
descr(bai_sum)
## Descriptive Statistics
## bai_sum by sex
## Data Frame: Dataset
## N: 736
##
## M F NA
## ----------------- -------- --------- --------
## Mean 6.81 9.47 12.14
## Std.Dev 7.14 8.45 15.07
## Min 0.00 0.00 0.00
## Q1 2.00 4.00 2.00
## Median 5.00 7.00 9.00
## Q3 9.00 13.00 14.00
## Max 44.00 48.00 44.00
## MAD 4.45 5.93 8.90
## IQR 7.00 9.00 11.00
## CV 1.05 0.89 1.24
## Skewness 2.08 1.52 1.20
## SE.Skewness 0.09 0.07 0.79
## Kurtosis 5.51 2.70 -0.04
## N.Valid 731.00 1214.00 7.00
## Pct.Valid 99.32 100.00 100.00
wilcox.test(bai_sum ~ sex, data = Dataset)
##
## Wilcoxon rank sum test with continuity correction
##
## data: bai_sum by sex
## W = 348512, p-value = 0.000000000000002
## alternative hypothesis: true location shift is not equal to 0
Dataset %>%
filter(!is.na(sex)) %>%
{sjstats::mwu(.,bai_sum, sex,distribution = "asymptotic")}
##
## # Mann-Whitney-U-Test
##
## Groups 1 = M (n = 731) | 2 = F (n = 1214):
## U = 616057.500, W = 348511.500, p < 0.001, Z = -7.951
## effect-size r = 0.180
## rank-mean(1) = 842.76
## rank-mean(2) = 1051.42
Dataset %>%
filter(!is.na(sex), !is.na(bai_class)) %>% #<- filtering
group_by(sex) %>%
count(bai_class) %>%
mutate(prop=prop.table(n)*100) %>%
select(-n) %>%
pivot_wider(names_from = c(bai_class),
values_from = prop)
## # A tibble: 2 x 5
## # Groups: sex [2]
## sex minima leve moderada grave
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 M 79.8 13.8 4.38 2.05
## 2 F 65.7 22.5 8.73 3.13
Individual chi square for each sex
Dataset %>%
filter(!is.na(sex)) %>% #<- filtering
group_by(sex) %>%
summarise(pvalue = chisq.test(table(bai_class))$p.value)
## # A tibble: 2 x 2
## sex pvalue
## <fct> <dbl>
## 1 M 5.09e-258
## 2 F 1.51e-252
lm(age ~ country, Dataset) %>%
apaTables::apa.aov.table()
##
##
## ANOVA results using age as the dependent variable
##
##
## Predictor SS df MS F p partial_eta2
## (Intercept) 558849.60 1 558849.60 39488.47 .000
## country 996.03 2 498.01 35.19 .000 .04
## Error 27257.18 1926 14.15
## CI_90_partial_eta2
##
## [.02, .05]
##
##
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
emmeans::emmeans(lm(age ~ country, Dataset), pairwise ~country, adj = "tukey")
## $emmeans
## country emmean SE df lower.CL upper.CL
## SPAIN 21.5 0.108 1926 21.2 21.7
## PORTUGAL 20.4 0.182 1926 20.0 20.9
## BRAZIL 22.8 0.220 1926 22.3 23.4
##
## Confidence level used: 0.95
## Conf-level adjustment: sidak method for 3 estimates
##
## $contrasts
## contrast estimate SE df t.ratio p.value
## SPAIN - PORTUGAL 1.06 0.212 1926 4.986 <.0001
## SPAIN - BRAZIL -1.33 0.245 1926 -5.450 <.0001
## PORTUGAL - BRAZIL -2.39 0.286 1926 -8.376 <.0001
##
## P value adjustment: tukey method for comparing a family of 3 estimates
Dataset %>%
filter(country == "SPAIN") %>%
{chisq.test(table(.$sex))}
##
## Chi-squared test for given probabilities
##
## data: table(.$sex)
## X-squared = 161, df = 1, p-value <0.0000000000000002
lm(bdi_sum ~ country, Dataset) %>%
residuals() %>%
qplot(.)
lm(bdi_sum ~ country, Dataset) %>%
residuals() %>% shapiro.test(.)
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.88, p-value <0.0000000000000002
lm(bai_sum ~ country, Dataset) %>%
residuals() %>% shapiro.test(.)
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.84, p-value <0.0000000000000002
Correlation
cor(Dataset$bdi_sum, Dataset$bai_sum, method = "spearman", use = "pairwise.complete.obs")
## [1] 0.5762