This markdown reports the analyses performed in the manuscript entitled “Depression and Anxiety Symptoms in a Representative Sample of Undergraduate Students in Spain, Portugal, and Brazil” (DOI: https://dx.doi.org/10.1590/0102.3772e36412). All files are available at https://osf.io/xpvbg/. To reproduce the results, load the Rdata file and run all chunks if ‘eval = FALSE’ is not present. I recommend going straight to line 280 to reproduce the published results.
In case you have any questions or suggestions, please feel free to contact me at luisfca@puc-rio.br
#Global options
knitr::opts_chunk$set(echo = TRUE,
message = FALSE,
warning = FALSE,
include = TRUE,
cache = FALSE)
knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
# auto format (kable)
options(kableExtra.auto_format = FALSE)
#get file
load(file="C:/Users/luisf/Dropbox/Puc-Rio/Projeto - Junia/Base R - Pesquisa mapfre.RData")
library(tidyverse) #environment
library(psych) #psychometric analysis
library(summarytools) #descriptive
This set of chunks is not necessary. They just present what I performed with the raw data.
Dataset <- janitor::clean_names(Dataset)
Dataset <- Dataset %>%
mutate(id = row_number())
BDI
plot_bdi <- Dataset %>%
select(starts_with("bdi")) %>%
pivot_longer(everything()) %>%
ggplot(., aes(x=name, y = value)) +
geom_boxplot()
BAI
plot_bai <- Dataset %>%
select(starts_with("bai")) %>%
pivot_longer(everything()) %>%
ggplot(., aes(x=name, y = value)) +
geom_boxplot()
Display plots
gridExtra::grid.arrange(plot_bdi, plot_bai)
BDI
Dataset %>%
select(starts_with("bdi")) %>%
DataExplorer::plot_missing()
BAI
Dataset %>%
select(starts_with("bai")) %>%
DataExplorer::plot_missing()
BDI
Dataset %>%
select(id,country,starts_with("bdi")) %>% #select all items
mutate(number_na = rowSums(is.na(.))) %>% #
filter(number_na == Dataset %>% select(starts_with("bdi")) %>% ncol()) %>% #all missing
count(country, id)
## country id n
## 1 SPAIN 9 1
## 2 SPAIN 10 1
## 3 SPAIN 486 1
## 4 SPAIN 532 1
## 5 PORTUGAL 1217 1
## 6 PORTUGAL 1218 1
## 7 PORTUGAL 1441 1
## 8 BRAZIL 1795 1
BAI
Dataset %>%
select(id,country,starts_with("bai")) %>% #select all items
mutate(number_na = rowSums(is.na(.))) %>% #
filter(number_na == Dataset %>% select(starts_with("bai")) %>% ncol()) %>% #all missing
count(country, id)
## country id n
## 1 SPAIN 8 1
## 2 SPAIN 9 1
## 3 SPAIN 10 1
## 4 PORTUGAL 1217 1
## 5 PORTUGAL 1218 1
Create a vector with the complete data.
imputed_data_bdi <- Dataset %>%
select(starts_with("bdi"), -bdi_sum, -bdi_class) %>%
mice::mice(., m=5, maxit = 50, method = 'pmm', seed = 500)
Just checking the results
summary(imputed_data_bdi)
## Length Class Mode
## data 21 data.frame list
## imp 21 -none- list
## m 1 -none- numeric
## where 41097 -none- logical
## blocks 21 -none- list
## call 6 -none- call
## nmis 21 -none- numeric
## method 21 -none- character
## predictorMatrix 441 -none- numeric
## visitSequence 21 -none- character
## formulas 21 -none- list
## post 21 -none- character
## blots 21 -none- list
## seed 1 -none- numeric
## iteration 1 -none- numeric
## lastSeedValue 626 -none- numeric
## chainMean 5250 -none- numeric
## chainVar 5250 -none- numeric
## loggedEvents 0 -none- NULL
## version 1 package_version list
## date 1 Date numeric
Get complete data ( 2nd out of 5)
complete_data_bdi <- mice::complete(imputed_data_bdi,2)
Double check with this new data is complete
DataExplorer::plot_missing(complete_data_bdi)
Add sums to this new data
complete_data_bdi <- complete_data_bdi %>%
mutate(bdi_sum = rowSums(select(., starts_with("bdi")), na.rm=T))
Now I’ll Compare the results obtained with the raw data (with missing cases) and the results obtained with the complete data.
comparable_results <- bind_rows(Dataset %>%
mutate(base = "original") %>%
select(base, bdi_sum),
complete_data_bdi %>%
mutate(base = "imputada") %>%
select(base, bdi_sum))
The linear correlation will be the used as an index of the association of these two datasets.
comparable_results %>%
unstack(bdi_sum~base) %>%
ggplot(., aes(x = original, y = imputada)) +
geom_jitter() +
geom_smooth()
The Correlaation coefficient is below.
cor(Dataset$bdi_sum, complete_data_bdi$bdi_sum, use = "complete.obs")
The distributionn of descriptive results is also needed.
ggplot(comparable_results, aes(x = base, y = bdi_sum)) +
geom_boxplot()
Finally, the wilcox test will check if statistical differences are present. This test could be performed with pairwise options. However, as some cases are missing, I’ll just run the “independent” version of the wt.
wilcox.test(bdi_sum ~ base, data = comparable_results)
Dataset <- Dataset %>%
mutate(bdi_sum = rowSums(select(., starts_with("bdi")), na.rm=T)) #adjust for all missing)
If the participant left blank all items, do not use
#Dataset
Dataset <- Dataset %>% #get the dataset
mutate(number_na_bdi = rowSums(is.na(select(.,starts_with("bdi"))))) %>% #create a variable to count missing
mutate(bdi_sum = if_else(number_na_bdi == 21, NA_real_,bdi_sum)) #if all is missing
Dataset <- Dataset %>%
mutate(bdi_class = case_when(
bdi_sum <= 13 ~ "minima",
bdi_sum < 20 ~ "leve",
bdi_sum < 29 ~ "moderada",
bdi_sum >= 29 ~ "grave"))
Dataset <- Dataset %>%
mutate(bdi_class = factor(bdi_class,
levels=c("minima","leve","moderada","grave")))
Dataset <- Dataset %>%
mutate(bai_sum = rowSums(select(., starts_with("bai")), na.rm=T)) #adjust for all missing)
If the participant left blank all items, do not use
#Dataset
Dataset <- Dataset %>% #get the dataset
mutate(number_na_bai = rowSums(is.na(select(.,starts_with("bai"))))) %>% #create a variable to count missing
mutate(bai_sum = if_else(number_na_bai == 21, NA_real_,bai_sum)) #if all is missing
Dataset <- Dataset %>%
mutate(bai_class = case_when(
bai_sum <= 10 ~ "minima",
bai_sum < 20 ~ "leve",
bai_sum < 31 ~ "moderada",
bai_sum >= 31 ~ "grave"))
Dataset <- Dataset %>%
mutate(bai_class = factor(bai_class,
levels=c("minima","leve","moderada","grave")))
In the reality, the manuscript starts here.
Dataset %>%
select(starts_with("bdi"), -bdi_sum, -bdi_class) %>%
alpha()
##
## Reliability analysis
## Call: alpha(x = .)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.89 0.9 0.9 0.29 8.7 0.0034 0.44 0.37 0.29
##
## lower alpha upper 95% confidence boundaries
## 0.89 0.89 0.9
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## bdi_1 0.89 0.89 0.89 0.29 8.1 0.0036 0.0049 0.29
## bdi_2 0.89 0.89 0.90 0.30 8.4 0.0035 0.0050 0.30
## bdi_3 0.89 0.89 0.90 0.29 8.2 0.0035 0.0049 0.29
## bdi_4 0.89 0.89 0.90 0.29 8.2 0.0036 0.0050 0.29
## bdi_5 0.89 0.89 0.90 0.29 8.3 0.0035 0.0052 0.29
## bdi_6 0.89 0.89 0.90 0.30 8.5 0.0035 0.0051 0.30
## bdi_7 0.89 0.89 0.89 0.29 8.1 0.0036 0.0047 0.29
## bdi_8 0.89 0.89 0.90 0.29 8.3 0.0035 0.0052 0.29
## bdi_9 0.89 0.90 0.90 0.30 8.6 0.0034 0.0045 0.30
## bdi_10 0.89 0.89 0.90 0.30 8.5 0.0035 0.0051 0.30
## bdi_11 0.89 0.89 0.90 0.30 8.4 0.0035 0.0052 0.30
## bdi_12 0.89 0.89 0.90 0.29 8.2 0.0036 0.0051 0.29
## bdi_13 0.89 0.89 0.90 0.29 8.2 0.0036 0.0051 0.29
## bdi_14 0.89 0.89 0.89 0.29 8.1 0.0036 0.0047 0.29
## bdi_15 0.89 0.89 0.89 0.29 8.1 0.0036 0.0046 0.29
## bdi_16 0.89 0.89 0.90 0.30 8.5 0.0035 0.0048 0.30
## bdi_17 0.89 0.89 0.90 0.29 8.2 0.0036 0.0053 0.29
## bdi_18 0.89 0.89 0.90 0.30 8.4 0.0035 0.0052 0.29
## bdi_19 0.89 0.89 0.90 0.29 8.3 0.0035 0.0052 0.29
## bdi_20 0.89 0.89 0.89 0.29 8.2 0.0036 0.0048 0.29
## bdi_21 0.89 0.89 0.90 0.30 8.5 0.0035 0.0051 0.30
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## bdi_1 1946 0.64 0.66 0.65 0.60 0.248 0.53
## bdi_2 1947 0.52 0.52 0.48 0.45 0.543 0.69
## bdi_3 1949 0.58 0.60 0.57 0.53 0.298 0.57
## bdi_4 1947 0.62 0.62 0.60 0.56 0.375 0.59
## bdi_5 1946 0.56 0.56 0.53 0.50 0.488 0.61
## bdi_6 1946 0.48 0.50 0.46 0.43 0.162 0.51
## bdi_7 1945 0.66 0.65 0.64 0.60 0.386 0.70
## bdi_8 1949 0.59 0.58 0.55 0.52 0.790 0.72
## bdi_9 1947 0.40 0.44 0.40 0.36 0.072 0.32
## bdi_10 1947 0.52 0.51 0.47 0.45 0.342 0.69
## bdi_11 1941 0.54 0.53 0.49 0.47 0.544 0.69
## bdi_12 1943 0.60 0.61 0.58 0.54 0.433 0.62
## bdi_13 1943 0.61 0.60 0.57 0.54 0.464 0.79
## bdi_14 1941 0.64 0.64 0.62 0.59 0.307 0.67
## bdi_15 1939 0.66 0.65 0.64 0.60 0.565 0.69
## bdi_16 1933 0.53 0.51 0.48 0.45 0.855 0.78
## bdi_17 1938 0.60 0.60 0.57 0.54 0.434 0.64
## bdi_18 1942 0.55 0.54 0.50 0.48 0.535 0.71
## bdi_19 1942 0.59 0.57 0.54 0.52 0.756 0.80
## bdi_20 1942 0.63 0.62 0.60 0.57 0.527 0.69
## bdi_21 1939 0.48 0.51 0.47 0.44 0.136 0.45
##
## Non missing response frequency for each item
## 0 1 2 3 miss
## bdi_1 0.79 0.17 0.03 0.01 0.01
## bdi_2 0.55 0.38 0.05 0.02 0.01
## bdi_3 0.75 0.20 0.04 0.01 0.00
## bdi_4 0.68 0.28 0.04 0.01 0.01
## bdi_5 0.56 0.40 0.03 0.01 0.01
## bdi_6 0.88 0.10 0.01 0.02 0.01
## bdi_7 0.72 0.20 0.06 0.02 0.01
## bdi_8 0.37 0.50 0.11 0.02 0.00
## bdi_9 0.94 0.05 0.00 0.01 0.01
## bdi_10 0.77 0.14 0.07 0.02 0.01
## bdi_11 0.55 0.39 0.04 0.02 0.01
## bdi_12 0.63 0.33 0.03 0.01 0.01
## bdi_13 0.67 0.24 0.04 0.05 0.01
## bdi_14 0.80 0.10 0.09 0.01 0.01
## bdi_15 0.54 0.37 0.08 0.01 0.01
## bdi_16 0.35 0.49 0.12 0.04 0.01
## bdi_17 0.64 0.31 0.04 0.01 0.01
## bdi_18 0.57 0.34 0.06 0.02 0.01
## bdi_19 0.46 0.34 0.18 0.02 0.01
## bdi_20 0.57 0.35 0.06 0.02 0.01
## bdi_21 0.90 0.08 0.02 0.01 0.01
Dataset %>%
select(starts_with("bai"), -bai_sum, -bai_class) %>%
alpha()
##
## Reliability analysis
## Call: alpha(x = .)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.9 0.9 0.91 0.31 9.4 0.0032 0.41 0.39 0.31
##
## lower alpha upper 95% confidence boundaries
## 0.89 0.9 0.91
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## bai_1 0.90 0.9 0.91 0.31 9.1 0.0033 0.0081 0.31
## bai_2 0.90 0.9 0.91 0.32 9.2 0.0032 0.0077 0.31
## bai_3 0.90 0.9 0.91 0.31 9.0 0.0033 0.0082 0.31
## bai_4 0.89 0.9 0.91 0.31 8.8 0.0034 0.0074 0.30
## bai_5 0.89 0.9 0.90 0.30 8.7 0.0034 0.0071 0.30
## bai_6 0.90 0.9 0.91 0.31 9.0 0.0033 0.0080 0.30
## bai_7 0.89 0.9 0.91 0.31 8.8 0.0034 0.0075 0.30
## bai_8 0.89 0.9 0.91 0.31 8.8 0.0033 0.0078 0.30
## bai_9 0.90 0.9 0.91 0.31 8.9 0.0033 0.0075 0.31
## bai_10 0.89 0.9 0.90 0.30 8.7 0.0034 0.0074 0.30
## bai_11 0.89 0.9 0.90 0.30 8.8 0.0034 0.0074 0.30
## bai_12 0.89 0.9 0.90 0.31 8.8 0.0033 0.0077 0.30
## bai_13 0.90 0.9 0.90 0.30 8.8 0.0033 0.0074 0.30
## bai_14 0.90 0.9 0.91 0.31 8.9 0.0033 0.0079 0.31
## bai_15 0.90 0.9 0.91 0.31 9.0 0.0033 0.0076 0.30
## bai_16 0.90 0.9 0.91 0.32 9.4 0.0032 0.0068 0.31
## bai_17 0.89 0.9 0.90 0.31 8.8 0.0034 0.0076 0.30
## bai_18 0.90 0.9 0.91 0.31 9.1 0.0033 0.0082 0.31
## bai_19 0.90 0.9 0.91 0.31 9.0 0.0033 0.0081 0.31
## bai_20 0.90 0.9 0.91 0.32 9.4 0.0031 0.0070 0.31
## bai_21 0.90 0.9 0.91 0.31 9.1 0.0033 0.0081 0.31
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## bai_1 1944 0.53 0.53 0.49 0.47 0.41 0.63
## bai_2 1946 0.49 0.48 0.44 0.42 0.69 0.75
## bai_3 1940 0.55 0.56 0.53 0.50 0.34 0.63
## bai_4 1941 0.66 0.63 0.62 0.59 0.89 0.89
## bai_5 1947 0.70 0.68 0.67 0.64 0.69 0.90
## bai_6 1949 0.57 0.58 0.55 0.52 0.28 0.61
## bai_7 1948 0.65 0.65 0.63 0.60 0.34 0.68
## bai_8 1948 0.63 0.63 0.61 0.58 0.36 0.65
## bai_9 1942 0.60 0.61 0.59 0.55 0.20 0.53
## bai_10 1946 0.70 0.67 0.66 0.63 1.04 0.88
## bai_11 1944 0.65 0.66 0.65 0.61 0.22 0.59
## bai_12 1946 0.63 0.64 0.63 0.58 0.30 0.63
## bai_13 1942 0.63 0.65 0.64 0.59 0.17 0.47
## bai_14 1947 0.58 0.59 0.56 0.53 0.24 0.59
## bai_15 1945 0.57 0.59 0.57 0.52 0.22 0.57
## bai_16 1947 0.41 0.41 0.36 0.34 0.18 0.56
## bai_17 1940 0.65 0.65 0.63 0.60 0.39 0.69
## bai_18 1944 0.56 0.55 0.51 0.49 0.55 0.77
## bai_19 1939 0.53 0.56 0.53 0.49 0.14 0.44
## bai_20 1940 0.43 0.43 0.38 0.36 0.50 0.71
## bai_21 1944 0.53 0.53 0.50 0.47 0.37 0.66
##
## Non missing response frequency for each item
## 0 1 2 3 miss
## bai_1 0.67 0.26 0.07 0.00 0.01
## bai_2 0.47 0.38 0.14 0.01 0.01
## bai_3 0.74 0.19 0.07 0.01 0.01
## bai_4 0.41 0.35 0.19 0.05 0.01
## bai_5 0.56 0.24 0.15 0.05 0.01
## bai_6 0.80 0.13 0.06 0.01 0.00
## bai_7 0.76 0.16 0.07 0.01 0.00
## bai_8 0.73 0.20 0.06 0.01 0.00
## bai_9 0.85 0.10 0.04 0.01 0.01
## bai_10 0.30 0.42 0.21 0.07 0.01
## bai_11 0.85 0.09 0.05 0.01 0.01
## bai_12 0.78 0.15 0.06 0.01 0.01
## bai_13 0.87 0.10 0.03 0.00 0.01
## bai_14 0.83 0.12 0.04 0.01 0.01
## bai_15 0.85 0.10 0.04 0.01 0.01
## bai_16 0.89 0.06 0.03 0.02 0.01
## bai_17 0.71 0.20 0.07 0.01 0.01
## bai_18 0.60 0.27 0.10 0.02 0.01
## bai_19 0.89 0.08 0.03 0.00 0.01
## bai_20 0.61 0.29 0.09 0.01 0.01
## bai_21 0.72 0.21 0.06 0.01 0.01
Country
Dataset %>%
freq(country)
## Frequencies
## Dataset$country
## Type: Factor
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## -------------- ------ --------- -------------- --------- --------------
## SPAIN 1216 62.14 62.14 62.14 62.14
## PORTUGAL 426 21.77 83.90 21.77 83.90
## BRAZIL 315 16.10 100.00 16.10 100.00
## <NA> 0 0.00 100.00
## Total 1957 100.00 100.00 100.00 100.00
Check differences
Dataset %>%
group_by(country) %>%
summarise(pvalue = chisq.test(table(sex))$p.value)
## # A tibble: 3 x 2
## country pvalue
## <fct> <dbl>
## 1 SPAIN 7.34e-37
## 2 PORTUGAL 3.33e- 1
## 3 BRAZIL 3.38e- 1
sex
Dataset %>%
group_by(country) %>%
freq(sex)
## Frequencies
## Dataset$sex
## Type: Factor
## Group: country = SPAIN
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## M 384 31.76 31.76 31.58 31.58
## F 825 68.24 100.00 67.85 99.42
## <NA> 7 0.58 100.00
## Total 1216 100.00 100.00 100.00 100.00
##
## Group: country = PORTUGAL
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## M 203 47.65 47.65 47.65 47.65
## F 223 52.35 100.00 52.35 100.00
## <NA> 0 0.00 100.00
## Total 426 100.00 100.00 100.00 100.00
##
## Group: country = BRAZIL
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## M 149 47.30 47.30 47.30 47.30
## F 166 52.70 100.00 52.70 100.00
## <NA> 0 0.00 100.00
## Total 315 100.00 100.00 100.00 100.00
Age
Dataset %>%
group_by(country, sex) %>%
descr(age)
## Descriptive Statistics
## Dataset$age
## Group: country = SPAIN, sex = M
## N: 384
##
## age
## ----------------- --------
## Mean 21.43
## Std.Dev 3.48
## Min 18.00
## Q1 19.00
## Median 21.00
## Q3 22.00
## Max 48.00
## MAD 2.97
## IQR 3.00
## CV 0.16
## Skewness 2.85
## SE.Skewness 0.12
## Kurtosis 14.11
## N.Valid 384.00
## Pct.Valid 100.00
##
## Group: country = SPAIN, sex = F
## N: 825
##
## age
## ----------------- --------
## Mean 21.52
## Std.Dev 2.77
## Min 18.00
## Q1 20.00
## Median 21.00
## Q3 23.00
## Max 43.00
## MAD 2.97
## IQR 3.00
## CV 0.13
## Skewness 2.29
## SE.Skewness 0.09
## Kurtosis 11.63
## N.Valid 823.00
## Pct.Valid 99.76
##
## Group: country = SPAIN, sex = NA
## N: 7
##
## age
## ----------------- -------
## Mean 21.33
## Std.Dev 4.16
## Min 18.00
## Q1 18.00
## Median 20.00
## Q3 26.00
## Max 26.00
## MAD 2.97
## IQR 4.00
## CV 0.20
## Skewness 0.29
## SE.Skewness 1.22
## Kurtosis -2.33
## N.Valid 3.00
## Pct.Valid 42.86
##
## Group: country = PORTUGAL, sex = M
## N: 203
##
## age
## ----------------- --------
## Mean 20.51
## Std.Dev 1.64
## Min 18.00
## Q1 19.00
## Median 21.00
## Q3 22.00
## Max 23.00
## MAD 1.48
## IQR 3.00
## CV 0.08
## Skewness -0.06
## SE.Skewness 0.17
## Kurtosis -1.23
## N.Valid 203.00
## Pct.Valid 100.00
##
## Group: country = PORTUGAL, sex = F
## N: 223
##
## age
## ----------------- --------
## Mean 20.37
## Std.Dev 1.68
## Min 18.00
## Q1 19.00
## Median 20.00
## Q3 22.00
## Max 23.00
## MAD 1.48
## IQR 3.00
## CV 0.08
## Skewness 0.09
## SE.Skewness 0.16
## Kurtosis -1.30
## N.Valid 223.00
## Pct.Valid 100.00
##
## Group: country = BRAZIL, sex = M
## N: 149
##
## age
## ----------------- --------
## Mean 22.13
## Std.Dev 4.31
## Min 17.00
## Q1 20.00
## Median 21.00
## Q3 23.00
## Max 41.00
## MAD 1.48
## IQR 3.00
## CV 0.19
## Skewness 2.40
## SE.Skewness 0.21
## Kurtosis 6.45
## N.Valid 133.00
## Pct.Valid 89.26
##
## Group: country = BRAZIL, sex = F
## N: 166
##
## age
## ----------------- --------
## Mean 23.41
## Std.Dev 8.87
## Min 17.00
## Q1 19.00
## Median 21.00
## Q3 23.00
## Max 68.00
## MAD 2.97
## IQR 4.00
## CV 0.38
## Skewness 3.18
## SE.Skewness 0.19
## Kurtosis 10.26
## N.Valid 160.00
## Pct.Valid 96.39
Check differences
Dataset %>%
group_by(country) %>%
nest() %>%
mutate(t_test = map(data, ~t.test(.$age ~ .$sex)$p.value)) %>%
unnest(t_test)
## # A tibble: 3 x 3
## # Groups: country [3]
## country data t_test
## <fct> <list> <dbl>
## 1 SPAIN <tibble [1,216 x 93]> 0.669
## 2 PORTUGAL <tibble [426 x 93]> 0.387
## 3 BRAZIL <tibble [315 x 93]> 0.109
This table was checked twice. Last checked on November 4 2020.
The table in the manuscript presents some descriptive analysis and inferential results. However, it does not present a (formal or typical) regression analysis, but rather several univariate tests. These last tests were conducted after some dispute during the peer review. However, the results are parallel and they are presented below.
Dataset %>%
descr(bdi_sum)
## Descriptive Statistics
## Dataset$bdi_sum
## N: 1957
##
## bdi_sum
## ----------------- ---------
## Mean 9.23
## Std.Dev 7.74
## Min 0.00
## Q1 4.00
## Median 8.00
## Q3 13.00
## Max 57.00
## MAD 5.93
## IQR 9.00
## CV 0.84
## Skewness 1.62
## SE.Skewness 0.06
## Kurtosis 4.11
## N.Valid 1949.00
## Pct.Valid 99.59
Dataset %>%
group_by(country) %>%
descr(bdi_sum)
## Descriptive Statistics
## bdi_sum by country
## Data Frame: Dataset
## N: 1216
##
## SPAIN PORTUGAL BRAZIL
## ----------------- --------- ---------- --------
## Mean 8.86 9.05 10.89
## Std.Dev 7.54 7.73 8.29
## Min 0.00 0.00 0.00
## Q1 3.00 4.00 5.00
## Median 7.00 7.00 9.50
## Q3 12.00 12.00 15.00
## Max 57.00 53.00 41.00
## MAD 5.93 5.93 8.15
## IQR 9.00 8.00 10.00
## CV 0.85 0.85 0.76
## Skewness 1.77 1.76 1.01
## SE.Skewness 0.07 0.12 0.14
## Kurtosis 5.12 5.11 0.80
## N.Valid 1212.00 423.00 314.00
## Pct.Valid 99.67 99.30 99.68
Percentages
Dataset %>%
group_by(country) %>%
filter(!is.na(bdi_class)) %>%
count(bdi_class) %>%
mutate(prop=prop.table(n)*100) %>%
select(-n) %>%
pivot_wider(names_from = c(bdi_class),
values_from = prop)
## # A tibble: 3 x 5
## # Groups: country [3]
## country minima leve moderada grave
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 SPAIN 80.4 11.4 5.69 2.56
## 2 PORTUGAL 79.0 11.8 6.38 2.84
## 3 BRAZIL 68.8 16.2 10.8 4.14
Individual chi square for each country
Dataset %>%
group_by(country) %>%
summarise(pvalue = chisq.test(table(bdi_class))$p.value)
## # A tibble: 3 x 2
## country pvalue
## <fct> <dbl>
## 1 SPAIN 0.
## 2 PORTUGAL 1.48e-143
## 3 BRAZIL 2.67e- 71
Dataset %>%
group_by(sex) %>%
descr(bdi_sum)
## Descriptive Statistics
## bdi_sum by sex
## Data Frame: Dataset
## N: 736
##
## M F NA
## ----------------- -------- --------- --------
## Mean 8.68 9.59 5.14
## Std.Dev 7.74 7.73 5.46
## Min 0.00 0.00 1.00
## Q1 3.00 4.00 2.00
## Median 7.00 8.00 3.00
## Q3 12.00 13.00 9.00
## Max 57.00 53.00 16.00
## MAD 5.93 7.41 1.48
## IQR 9.00 9.00 4.00
## CV 0.89 0.81 1.06
## Skewness 1.89 1.47 1.04
## SE.Skewness 0.09 0.07 0.79
## Kurtosis 5.63 3.29 -0.65
## N.Valid 732.00 1210.00 7.00
## Pct.Valid 99.46 99.67 100.00
Percentages
Dataset %>%
filter(!is.na(sex), !is.na(bdi_class)) %>% #<- filtering
group_by(sex) %>%
count(bdi_class) %>%
mutate(prop=prop.table(n)*100) %>%
select(-n) %>%
pivot_wider(names_from = c(bdi_class),
values_from = prop)
## # A tibble: 2 x 5
## # Groups: sex [2]
## sex minima leve moderada grave
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 M 81.0 10.7 5.05 3.28
## 2 F 76.4 13.2 7.69 2.64
Individual chi square for each sex
Dataset %>%
filter(!is.na(sex)) %>% #<- filtering
group_by(sex) %>%
summarise(pvalue = chisq.test(table(bai_class))$p.value)
## # A tibble: 2 x 2
## sex pvalue
## <fct> <dbl>
## 1 M 5.09e-258
## 2 F 1.51e-252
Regress the results of depression (BDI) on sex and country.
mod_completo_bdi <- lm(bdi_sum ~ sex + country, Dataset)
apaTables::apa.aov.table(mod_completo_bdi)
##
##
## ANOVA results using bdi_sum as the dependent variable
##
##
## Predictor SS df MS F p partial_eta2
## (Intercept) 35232.64 1 35232.64 595.18 .000
## sex 528.14 1 528.14 8.92 .003 .00
## country 1179.55 2 589.77 9.96 .000 .01
## Error 114723.93 1938 59.20
## CI_90_partial_eta2
##
## [.00, .01]
## [.00, .02]
##
##
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
Check the assumptions
#normality of the residuals
olsrr::ols_test_normality(mod_completo_bdi)
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.8825 0.0000
## Kolmogorov-Smirnov 0.1111 0.0000
## Cramer-von Mises 171.717 0.0000
## Anderson-Darling 47.4803 0.0000
## -----------------------------------------------
#equal variance of the residuals
olsrr::ols_test_breusch_pagan(mod_completo_bdi)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -----------------------------------
## Response : bdi_sum
## Variables: fitted values of bdi_sum
##
## Test Summary
## ----------------------------
## DF = 1
## Chi2 = 2.501723
## Prob > Chi2 = 0.1137219
#all plots
lindia::gg_diagnose(mod_completo_bdi)
Dataset %>%
descr(bai_sum)
## Descriptive Statistics
## Dataset$bai_sum
## N: 1957
##
## bai_sum
## ----------------- ---------
## Mean 8.48
## Std.Dev 8.11
## Min 0.00
## Q1 3.00
## Median 6.00
## Q3 12.00
## Max 48.00
## MAD 5.93
## IQR 9.00
## CV 0.96
## Skewness 1.71
## SE.Skewness 0.06
## Kurtosis 3.48
## N.Valid 1952.00
## Pct.Valid 99.74
Dataset %>%
group_by(country) %>%
descr(bai_sum)
## Descriptive Statistics
## bai_sum by country
## Data Frame: Dataset
## N: 1216
##
## SPAIN PORTUGAL BRAZIL
## ----------------- --------- ---------- --------
## Mean 8.55 7.92 9.01
## Std.Dev 8.06 8.04 8.40
## Min 0.00 0.00 0.00
## Q1 3.00 2.00 3.00
## Median 6.00 5.00 6.00
## Q3 12.00 11.00 12.00
## Max 48.00 45.00 46.00
## MAD 5.93 5.93 5.93
## IQR 9.00 9.00 9.00
## CV 0.94 1.02 0.93
## Skewness 1.68 1.82 1.64
## SE.Skewness 0.07 0.12 0.14
## Kurtosis 3.38 3.87 3.31
## N.Valid 1213.00 424.00 315.00
## Pct.Valid 99.75 99.53 100.00
Percentages
Dataset %>%
filter(!is.na(sex), !is.na(bai_class)) %>% #<- filtering
group_by(sex) %>%
count(bai_class) %>%
mutate(prop=prop.table(n)*100) %>%
select(-n) %>%
pivot_wider(names_from = c(bai_class),
values_from = prop)
## # A tibble: 2 x 5
## # Groups: sex [2]
## sex minima leve moderada grave
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 M 79.8 13.8 4.38 2.05
## 2 F 65.7 22.5 8.73 3.13
Dataset %>%
filter(!is.na(sex)) %>% #<- filtering
group_by(sex) %>%
summarise(pvalue = chisq.test(table(bai_class))$p.value)
## # A tibble: 2 x 2
## sex pvalue
## <fct> <dbl>
## 1 M 5.09e-258
## 2 F 1.51e-252
mod_completo_bai <- lm(bai_sum ~ sex + country, Dataset)
apaTables::apa.aov.table(mod_completo_bai)
##
##
## ANOVA results using bai_sum as the dependent variable
##
##
## Predictor SS df MS F p partial_eta2
## (Intercept) 23701.50 1 23701.50 372.46 .000
## sex 3248.35 1 3248.35 51.05 .000 .03
## country 252.51 2 126.25 1.98 .138 .00
## Error 123515.25 1941 63.63
## CI_90_partial_eta2
##
## [.02, .04]
## [.00, .01]
##
##
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
Assumptions
#normality
olsrr::ols_test_normality(mod_completo_bai)
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.8549 0.0000
## Kolmogorov-Smirnov 0.1427 0.0000
## Cramer-von Mises 186.2753 0.0000
## Anderson-Darling 72.9096 0.0000
## -----------------------------------------------
#equal variance
olsrr::ols_test_breusch_pagan(mod_completo_bai)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -----------------------------------
## Response : bai_sum
## Variables: fitted values of bai_sum
##
## Test Summary
## -------------------------------
## DF = 1
## Chi2 = 27.60531
## Prob > Chi2 = 1.487695e-07
#all plots
lindia::gg_diagnose(mod_completo_bai)
Assumptions were violated and univariate analyses were suggested.
Kruskall-Walis is a non-parametric procedure, which is similar to perfoming a one-way ANOVA on ranks.
kruskal.test(bdi_sum ~ country, data = Dataset)
##
## Kruskal-Wallis rank sum test
##
## data: bdi_sum by country
## Kruskal-Wallis chi-squared = 17.675, df = 2, p-value = 0.0001452
The results suggest a statistically significant difference. The effect size is compute by the code below.
Dataset %>%
rstatix::kruskal_effsize(bdi_sum ~ country) #effect size
## # A tibble: 1 x 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 bdi_sum 1957 0.00802 eta2[H] small
Pairwise Test for Multiple Comparisons of Mean Rank Sums (Nemenyi-Tests).
PMCMR::posthoc.kruskal.nemenyi.test(x=Dataset$bdi_sum, g=Dataset$country, method="Tukey")
##
## Pairwise comparisons using Tukey and Kramer (Nemenyi) test
## with Tukey-Dist approximation for independent samples
##
## data: Dataset$bdi_sum and Dataset$country
##
## SPAIN PORTUGAL
## PORTUGAL 0.9351 -
## BRAZIL 9.8e-05 0.0031
##
## P value adjustment method: none
And Dunn test.
Dataset %>%
rstatix::dunn_test(bdi_sum ~ country, p.adjust.method = "bonferroni")
## # A tibble: 3 x 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 bdi_sum SPAIN PORTUGAL 1212 423 0.349 7.27e-1 1.00e+0 ns
## 2 bdi_sum SPAIN BRAZIL 1212 314 4.16 3.22e-5 9.67e-5 ****
## 3 bdi_sum PORTUGAL BRAZIL 423 314 3.27 1.08e-3 3.24e-3 **
dunn.test::dunn.test(Dataset$bdi_sum,Dataset$country,method = "Bonferroni")
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 17.675, df = 2, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | BRAZIL PORTUGAL
## ---------+----------------------
## PORTUGAL | 3.269105
## | 0.0016*
## |
## SPAIN | 4.157182 0.349355
## | 0.0000* 1.0000
##
## alpha = 0.05
## Reject Ho if p <= alpha/2
wilcox.test(bdi_sum ~ sex, data = Dataset)
##
## Wilcoxon rank sum test with continuity correction
##
## data: bdi_sum by sex
## W = 404216, p-value = 0.001232
## alternative hypothesis: true location shift is not equal to 0
Dataset %>%
filter(!is.na(sex)) %>%
{sjstats::mwu(.,bdi_sum, sex,distribution = "asymptotic")}
##
## # Mann-Whitney-U-Test
##
## Groups 1 = M (n = 732) | 2 = F (n = 1210):
## U = 672494.000, W = 404216.000, p = 0.001, Z = -3.231
## effect-size r = 0.073
## rank-mean(1) = 918.71
## rank-mean(2) = 1003.44
kruskal.test(bai_sum ~ country, data = Dataset)
##
## Kruskal-Wallis rank sum test
##
## data: bai_sum by country
## Kruskal-Wallis chi-squared = 5.7509, df = 2, p-value = 0.05639
Dataset %>%
rstatix::kruskal_effsize(bai_sum ~ country) #effect size
## # A tibble: 1 x 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 bai_sum 1957 0.00192 eta2[H] small
Dataset %>%
group_by(country) %>%
filter(!is.na(bai_class)) %>%
count(bai_class) %>%
mutate(prop=prop.table(n)*100) %>%
select(-n) %>%
pivot_wider(names_from = c(bai_class),
values_from = prop)
## # A tibble: 3 x 5
## # Groups: country [3]
## country minima leve moderada grave
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 SPAIN 71.2 19.0 7.17 2.64
## 2 PORTUGAL 72.6 18.4 5.90 3.07
## 3 BRAZIL 67.3 21.6 8.25 2.86
Individual chi square for each country
Dataset %>%
group_by(country) %>%
summarise(pvalue = chisq.test(table(bai_class))$p.value)
## # A tibble: 3 x 2
## country pvalue
## <fct> <dbl>
## 1 SPAIN 2.04e-314
## 2 PORTUGAL 8.19e-116
## 3 BRAZIL 6.20e- 70
wilcox.test(bai_sum ~ sex, data = Dataset)
##
## Wilcoxon rank sum test with continuity correction
##
## data: bai_sum by sex
## W = 348512, p-value = 1.85e-15
## alternative hypothesis: true location shift is not equal to 0
Dataset %>%
filter(!is.na(sex)) %>%
{sjstats::mwu(.,bai_sum, sex,distribution = "asymptotic")}
##
## # Mann-Whitney-U-Test
##
## Groups 1 = M (n = 731) | 2 = F (n = 1214):
## U = 616057.500, W = 348511.500, p < 0.001, Z = -7.951
## effect-size r = 0.180
## rank-mean(1) = 842.76
## rank-mean(2) = 1051.42
cor.test(Dataset$bdi_sum, Dataset$bai_sum, method = "spearman", use = "pairwise.complete.obs")
##
## Spearman's rank correlation rho
##
## data: Dataset$bdi_sum and Dataset$bai_sum
## S = 522131145, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.5761969
# CSV
write.table(Dataset, file = "dataset_mapfre.csv", sep = ",", col.names = TRUE,row.names = FALSE, qmethod = "double")
write.table(backup, file = "dataset_original_mapfre.csv", sep = ",", col.names = TRUE,row.names = FALSE, qmethod = "double")
End of codes. If use, please cite.