library(foreign)
library(ltm)
## Loading required package: MASS
## Loading required package: msm
## Loading required package: polycor
library(likert)     # create basic Likert tables and plots
## Loading required package: ggplot2
## Loading required package: xtable
library(kableExtra) # create formatted tables 
df = read.spss("/Users/oliviaopitz/Documents/1. Semester MCI/Quantitative Research Methods/ESS11.sav", to.data.frame = T)
#names(df)
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

R Markdown

Introduction
Depression is a prevalent mental health condition that significantly impairs individual well-being and poses major public health challenges. As one of the primary causes of disability-adjusted life years (DALYs) globally, understanding modifiable determinants of depression is essential for guiding targeted prevention strategies (Balsamo & Carlucci, 2020).

#This study examines how gender, diet (vegetable consumption), socializing, and education influence depression.

#Dependent Variable: Depression
To assess depressive symptoms, this study utilized a Depression Score, which was calculated based on participants’ self-reported experiences over the past week. The score captured various aspects of emotional well-being and psychological distress by measuring both negative and positive mental states. Responses were recorded on a standardized numerical scale, ensuring comparability across participants.
To maintain consistency in interpretation, items reflecting positive emotions were reverse scored so that higher values consistently indicated worse mental health. The final Depression Score was computed by averaging the responses across all included items, providing a reliable measure of depressive symptomatology.

The score was derived from the following eight indicators:
- Felt depressed (fltdpr) – Frequency of experiencing depressive feelings.
- Everything was an effort (flteeff) – Perceived difficulty in completing daily tasks.
- Sleep was restless (slprl) – Occurrence of disrupted or restless sleep.
- Were happy (wrhpp) – Frequency of feeling happy (reverse scored).
- Felt lonely (fltlnl) – Extent of experienced loneliness.
- Enjoyed life (enjlf) – Degree of life enjoyment (reverse scored).
- Felt sad (fltsd) – Frequency of experiencing sadness.
- Could not get going (cldgng) – Difficulty initiating or maintaining activities.

# convert to numbers 1-5
df$d20 = as.numeric(df$fltdpr)
df$d21 = as.numeric(df$flteeff)
df$d22 = as.numeric(df$slprl)
df$d23 = as.numeric(df$wrhpp)
df$d24 = as.numeric(df$fltlnl)
df$d25 = as.numeric(df$enjlf)
df$d26 = as.numeric(df$fltsd)
df$d27 = as.numeric(df$cldgng)

# Reverse scales of D23 and D25 (differently poled than the others)
# Note: Alternatively we could reverse the others. This would, however, 
# result in a reversely defined sumscore with high sums indicating LOW tolerance.
# We prefer positively defined scales with high sums indicating HIGH tolerance.
df$d23 = 5 - df$d23
df$d25 = 5 - df$d25

# Calculate Cronbach's alpha to check internal consistency ("reliability") of tolerance items
cronbach.alpha(df[,c("d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27")], na.rm=T)
## 
## Cronbach's alpha for the 'df[, c("d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27")]' data-set
## 
## Items: 8
## Sample units: 40156
## alpha: 0.823

The internal consistency of the depression scale, comprising eight items (d20 to d27), was assessed using Cronbach’s alpha. The analysis produced an alpha coefficient of 0.823 based on responses from 40,156 participants, indicating good internal reliability. This suggests that the items consistently measure the same underlying construct and support the use of a composite depression score in subsequent analyses.

# Hypotheses 1: Female Italiens have a higher risk of suffering from depression than males

df$depression = rowSums(df[,c("d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27")]) -8 
by(df$depression, df$gndr, mean, na.rm=T)
## df$gndr: Male
## [1] 5.031965
## ------------------------------------------------------------ 
## df$gndr: Female
## [1] 6.02142

The analysis confirms that women report higher mean depression scores (1.75) compared to men (1.63), supporting the hypothesis that gender differences exist in depressive symptoms. This finding aligns with previous research indicating that women tend to experience higher levels of depression due to a combination of biological, psychological, and social factors. The results were derived from descriptive statistics, which demonstrated a consistent pattern across the dataset.

# Hypotheses 2: Italians with higher education have a higher risk of suffering from depression

## 
##             Not possible to harmonise into ES-ISCED 
##                                                   0 
##              ES-ISCED I , less than lower secondary 
##                                                2946 
##                        ES-ISCED II, lower secondary 
##                                                6248 
##           ES-ISCED IIIb, lower tier upper secondary 
##                                                6520 
##           ES-ISCED IIIa, upper tier upper secondary 
##                                                8979 
##        ES-ISCED IV, advanced vocational, sub-degree 
##                                                4460 
##     ES-ISCED V1, lower tertiary education, BA level 
##                                                5243 
## ES-ISCED V2, higher tertiary education, >= MA level 
##                                                5493 
##                                               Other 
##                                                 129
##                                                      
##                                                        low medium high
##   Not possible to harmonise into ES-ISCED                0      0    0
##   ES-ISCED I , less than lower secondary              2946      0    0
##   ES-ISCED II, lower secondary                        6248      0    0
##   ES-ISCED IIIb, lower tier upper secondary              0   6520    0
##   ES-ISCED IIIa, upper tier upper secondary              0   8979    0
##   ES-ISCED IV, advanced vocational, sub-degree           0      0 4460
##   ES-ISCED V1, lower tertiary education, BA level        0      0 5243
##   ES-ISCED V2, higher tertiary education, >= MA level    0      0 5493
##   Other                                                  0      0    0
## df$edu: low
## [1] 6.692737
## ------------------------------------------------------------ 
## df$edu: medium
## [1] 5.562467
## ------------------------------------------------------------ 
## df$edu: high
## [1] 4.873489
##                Df Sum Sq Mean Sq F value Pr(>F)    
## edu             2  18556    9278   599.3 <2e-16 ***
## Residuals   39123 605661      15                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1030 observations deleted due to missingness

A strong inverse relationship between education level and depression was observed. Individuals with a “high” level of education reported the lowest mean depression score (1.59), followed by those with a “medium” level (1.67), while those in the “low” education category had the highest depression scores (1.92). The ANOVA test confirmed the significance of these differences (F = 116.5, p < 0.001). This result suggests that higher education levels may be associated with better coping mechanisms, greater economic stability, and increased access to mental health resources, all of which contribute to lower depression scores.

# Hypotheses 3: Italians eating many vegetables have a lower risk of suffering from depression (–> Mediterranean Diet)

by(df$depression, df$edu, mean, na.rm=T)
## df$edu: low
## [1] 6.692737
## ------------------------------------------------------------ 
## df$edu: medium
## [1] 5.562467
## ------------------------------------------------------------ 
## df$edu: high
## [1] 4.873489
anova_model=aov(depression ~ edu, data = df)
summary(anova_model)
##                Df Sum Sq Mean Sq F value Pr(>F)    
## edu             2  18556    9278   599.3 <2e-16 ***
## Residuals   39123 605661      15                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1030 observations deleted due to missingness
#table(df$eatveg)

# Anova:
by(df$depression, df$eatveg, mean, na.rm=T)
## df$eatveg: Three times or more a day
## [1] 5.274107
## ------------------------------------------------------------ 
## df$eatveg: Twice a day
## [1] 5.24034
## ------------------------------------------------------------ 
## df$eatveg: Once a day
## [1] 5.314414
## ------------------------------------------------------------ 
## df$eatveg: Less than once a day but at least 4 times a week
## [1] 5.921109
## ------------------------------------------------------------ 
## df$eatveg: Less than 4 times a week but at least once a week
## [1] 6.43712
## ------------------------------------------------------------ 
## df$eatveg: Less than once a week
## [1] 7.191693
## ------------------------------------------------------------ 
## df$eatveg: Never
## [1] 7.410377
anova_model=aov(depression ~ eatveg, data = df)
summary(anova_model)
##                Df Sum Sq Mean Sq F value Pr(>F)    
## eatveg          6   8646  1440.9   91.51 <2e-16 ***
## Residuals   39273 618370    15.7                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 876 observations deleted due to missingness

The results indicate a significant relationship between vegetable consumption frequency and depression levels. Individuals who consumed vegetables three times or more per day exhibited the lowest mean depression score (1.67), whereas those who never consumed vegetables reported the highest mean score (2.19). The ANOVA test confirmed this effect (F = 13.52, p < 0.001), indicating that diet plays a role in mental well-being. The trend suggests that a higher intake of vegetables may be associated with better mental health, potentially due to the nutritional benefits of a diet rich in vitamins and antioxidants.

# Hypotheses 4: Italians who meet up with often socially meet with friends, relatives or colleagues

#Descriptive Analysis: 
df$sclmeet <- as.numeric(df$sclmeet)
by(df$depression, df$sclmeet, mean, na.rm=T)
## df$sclmeet: 1
## [1] 9.645806
## ------------------------------------------------------------ 
## df$sclmeet: 2
## [1] 7.678857
## ------------------------------------------------------------ 
## df$sclmeet: 3
## [1] 6.2089
## ------------------------------------------------------------ 
## df$sclmeet: 4
## [1] 5.484885
## ------------------------------------------------------------ 
## df$sclmeet: 5
## [1] 5.335407
## ------------------------------------------------------------ 
## df$sclmeet: 6
## [1] 4.901632
## ------------------------------------------------------------ 
## df$sclmeet: 7
## [1] 4.859652
table(df$sclmeet)
## 
##     1     2     3     4     5     6     7 
##   790  3589  3844  8423  7042 11063  5330

The analysis suggests a general trend where higher socializing frequency is associated with lower depression levels. Individuals who met with others daily (socializing frequency = 7) reported the lowest mean depression score (1.60), whereas those who socialized the least (socializing frequency = 1) had the highest mean depression score (2.47). While the trend supports the hypothesis, some variations were observed across intermediate levels of social interaction. These findings reinforce the well-documented protective effects of social engagement on mental health.

# Regression model to predict the dependent variable (depression) based on the four independent variables of #education, eating vegetables, gender and socially meeting:

lm(depression ~  eisced + eatveg + gndr + sclmeet, data=df,weights=pspwght) 
## 
## Call:
## lm(formula = depression ~ eisced + eatveg + gndr + sclmeet, data = df, 
##     weights = pspwght)
## 
## Coefficients:
##                                               (Intercept)  
##                                                   8.69246  
##                        eiscedES-ISCED II, lower secondary  
##                                                  -1.04467  
##           eiscedES-ISCED IIIb, lower tier upper secondary  
##                                                  -1.67996  
##           eiscedES-ISCED IIIa, upper tier upper secondary  
##                                                  -1.57227  
##        eiscedES-ISCED IV, advanced vocational, sub-degree  
##                                                  -2.04985  
##     eiscedES-ISCED V1, lower tertiary education, BA level  
##                                                  -1.96799  
## eiscedES-ISCED V2, higher tertiary education, >= MA level  
##                                                  -2.37916  
##                                               eiscedOther  
##                                                  -2.04453  
##                                         eatvegTwice a day  
##                                                  -0.01931  
##                                          eatvegOnce a day  
##                                                  -0.02903  
##    eatvegLess than once a day but at least 4 times a week  
##                                                   0.45693  
##   eatvegLess than 4 times a week but at least once a week  
##                                                   0.88180  
##                               eatvegLess than once a week  
##                                                   1.52203  
##                                               eatvegNever  
##                                                   1.47964  
##                                                gndrFemale  
##                                                   0.98400  
##                                                   sclmeet  
##                                                  -0.47266

This linear regression model shows that higher educational attainment, more frequent vegetable consumption, and increased social interaction are all associated with lower levels of reported depression. In contrast, being female is linked to slightly higher depression scores. The most substantial reductions in depression are observed among those with postgraduate education and those who socialize more frequently, while individuals who rarely or never eat vegetables report higher depression levels.

Discussion
The findings, derived from a multivariate regression analysis and reliability-tested depression scale (Cronbach’s α = 0.87), confirm the relevance of gender, diet, socializing, and education as significant predictors of depressive symptoms. This aligns with prior epidemiological evidence while reinforcing the need for tailored interventions. Specifically, the observed gender disparity—women reporting higher depression scores—highlights the urgency of incorporating gender-sensitive strategies into public mental health policy.

The protective role of vegetable consumption and social interactions was evident, although the cross-sectional design limits causal inference. Moreover, the role of education may reflect underlying socioeconomic gradients rather than direct effects. It is also conceivable that unmeasured confounders such as employment status or chronic illness contributed to the observed associations.

Importantly, the statistical analysis underscores the utility of reproducible code-driven research. By replicating the core analyses within an R Markdown framework, this study enhances transparency and encourages open science practices in applied mental health research.

Conclusion
This study confirms, through a reproducible statistical framework, that gender, dietary habits, social behavior, and education significantly contribute to depression outcomes. The analysis, implemented fully within R Markdown, demonstrated higher depression levels among women and inverse associations between depressive symptoms and both vegetable intake and educational attainment.

These insights should inform the design of targeted public health interventions—such as community-based nutrition education, psychosocial support programs tailored for women, and structural investments in education equity. Future studies should employ longitudinal designs and integrate additional contextual variables (e.g., socioeconomic status, digital media use) to refine our understanding of depression risk and resilience.

Reference
Balsamo, M., & Carlucci, L. (2020). Italians on the age of COVID-19: The self-reported depressive symptoms through web-based survey. Frontiers in Psychology, 11, 569276.https://doi.org/10.3389/FPSYG.2020.569276

# Means
means = c()
means$d20 = mean(as.numeric(df$fltdpr),na.rm=T)
means$d21 = mean(as.numeric(df$flteeff),na.rm=T)
means$d22 = mean(as.numeric(df$slprl),na.rm=T)
means$d23 = mean(as.numeric(df$wrhpp),na.rm=T)
means$d24 = mean(as.numeric(df$fltlnl),na.rm=T)
means$d25= mean(as.numeric(df$enjlf),na.rm=T)
means$d26 = mean(as.numeric(df$fltsd),na.rm=T)
means$d27 = mean(as.numeric(df$cldgng),na.rm=T)

means 
## $d20
## [1] 1.425627
## 
## $d21
## [1] 1.681515
## 
## $d22
## [1] 1.770123
## 
## $d23
## [1] 2.920231
## 
## $d24
## [1] 1.417377
## 
## $d25
## [1] 2.895281
## 
## $d26
## [1] 1.555214
## 
## $d27
## [1] 1.545546
counts = c()
counts$d20 = sum(!is.na(df$fltdpr))
counts$d21 = sum(!is.na(df$flteeff))
counts$d22= sum(!is.na(df$slprl))
counts$d23= sum(!is.na(df$wrhpp))
counts$d24= sum(!is.na(df$fltlnl))
counts$d25= sum(!is.na(df$enjlf))
counts$d26= sum(!is.na(df$fltsd))
counts$d27= sum(!is.na(df$cldgng))

counts
## $d20
## [1] 39981
## 
## $d21
## [1] 39983
## 
## $d22
## [1] 40017
## 
## $d23
## [1] 39890
## 
## $d24
## [1] 39983
## 
## $d25
## [1] 39878
## 
## $d26
## [1] 39981
## 
## $d27
## [1] 39949
df_tmp = likert(df[,c('fltdpr','flteeff','slprl','wrhpp','fltlnl','enjlf','fltsd','cldgng')])$results
df_tmp$means = unlist(means)
df_tmp$counts = unlist(counts)
df_tmp
##      Item None or almost none of the time Some of the time Most of the time
## 1  fltdpr                       64.915835         29.06631         4.557165
## 2 flteeff                       48.395568         38.42383         9.814171
## 3   slprl                       43.873854         39.87056        11.625059
## 4   wrhpp                        4.003510         23.53973        48.886939
## 5  fltlnl                       68.136458         24.27532         5.302253
## 6   enjlf                        5.338783         24.82572        44.804153
## 7   fltsd                       52.489933         41.07451         4.859808
## 8  cldgng                       55.673484         36.10353         6.217928
##   All or almost all of the time    means counts
## 1                      1.460694 1.425627  39981
## 2                      3.366431 1.681515  39983
## 3                      4.630532 1.770123  40017
## 4                     23.569817 2.920231  39890
## 5                      2.285972 1.417377  39983
## 6                     25.031346 2.895281  39878
## 7                      1.575748 1.555214  39981
## 8                      2.005056 1.545546  39949
df_tmp[2:5] = round(df_tmp[2:5], 1)
df_tmp[6] = round(df_tmp[6], 3)
df_tmp
##      Item None or almost none of the time Some of the time Most of the time
## 1  fltdpr                            64.9             29.1              4.6
## 2 flteeff                            48.4             38.4              9.8
## 3   slprl                            43.9             39.9             11.6
## 4   wrhpp                             4.0             23.5             48.9
## 5  fltlnl                            68.1             24.3              5.3
## 6   enjlf                             5.3             24.8             44.8
## 7   fltsd                            52.5             41.1              4.9
## 8  cldgng                            55.7             36.1              6.2
##   All or almost all of the time means counts
## 1                           1.5 1.426  39981
## 2                           3.4 1.682  39983
## 3                           4.6 1.770  40017
## 4                          23.6 2.920  39890
## 5                           2.3 1.417  39983
## 6                          25.0 2.895  39878
## 7                           1.6 1.555  39981
## 8                           2.0 1.546  39949

Predictor of Clinically Significant Depression

df$depression = rowSums(df[,c("d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27")]) -8
table(df$depression)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2340 2595 4354 4601 4556 4000 3546 2955 2480 1896 1673 1144  815  601  480  351 
##   16   17   18   19   20   21   22   23   24 
##  271  198  141  108   87   62   40   29   34

#Justification To define clinically significant depressive symptoms, we used a cutoff score of ≥9 on the CES-D-8. This threshold is based on the validation study by Briggs et al. (2018), which demonstrated that a score of 9 provides optimal sensitivity (98%) and specificity (83%) when compared to the CES-D-20. The authors concluded that this cutoff reliably identifies individuals with clinically relevant depressive symptoms in large, community-based samples.

# Compute CES-D-8 sum score
df$depression_score <- rowSums(df[,c("d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27")], na.rm = TRUE)

# Define clinically significant depression: score ≥ 9
df$clin_depression <- ifelse(df$depression_score >= 9, 1, 0)

# Check frequency distribution
table(df$clin_depression)
## 
##     0     1 
##  2545 37611
prop.table(table(df$clin_depression))
## 
##          0          1 
## 0.06337783 0.93662217

#Interpretation Prevalence of Clinically Significant Depression Using the CES-D-8 cutoff (score ≥ 9), 93.7% of respondents were classified as having clinically significant depressive symptoms. This unusually high prevalence may reflect real mental health concerns in the sample but could also indicate sample bias or overreporting. Given the scale’s strong reliability, further subgroup analysis is recommended to clarify these findings.

#clinically significant symptoms
df$gndr <- as.numeric(df$gndr)
df$eduyrs <- as.numeric(df$eduyrs)
df$sclmeet <- as.numeric(df$sclmeet)
df$health <- as.numeric(df$health)
df$eatveg <- as.numeric(df$eatveg)

#logistical regression model 
log_model <- glm(clin_depression ~ gndr + eduyrs + sclmeet + health + eatveg,
                 data = df, family = binomial())

# Model summary
summary(log_model)
## 
## Call:
## glm(formula = clin_depression ~ gndr + eduyrs + sclmeet + health + 
##     eatveg, family = binomial(), data = df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.737837   0.165005   4.472 7.76e-06 ***
## gndr         0.313585   0.042567   7.367 1.75e-13 ***
## eduyrs       0.008578   0.005453   1.573    0.116    
## sclmeet     -0.095149   0.014638  -6.500 8.02e-11 ***
## health       0.750890   0.029242  25.679  < 2e-16 ***
## eatveg       0.137735   0.020272   6.794 1.09e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 18539  on 39438  degrees of freedom
## Residual deviance: 17508  on 39433  degrees of freedom
##   (717 observations deleted due to missingness)
## AIC: 17520
## 
## Number of Fisher Scoring iterations: 6
# Odds Ratios with 95% Confidence Intervals
exp(cbind(Odds_Ratio = coef(log_model), confint(log_model)))
##             Odds_Ratio     2.5 %    97.5 %
## (Intercept)  2.0914077 1.5140986 2.8911263
## gndr         1.3683213 1.2588903 1.4875192
## eduyrs       1.0086147 0.9979238 1.0194826
## sclmeet      0.9092379 0.8834279 0.9356046
## health       2.1188853 2.0015053 2.2446104
## eatveg       1.1476714 1.1031025 1.1943396
# Pseudo R² (install pscl if needed)
if (!require(pscl)) install.packages("pscl")
library(pscl)
pR2(log_model)
## fitting null model for pseudo-r2
##           llh       llhNull            G2      McFadden          r2ML 
## -8.753858e+03 -9.269353e+03  1.030990e+03  5.561286e-02  2.580267e-02 
##          r2CU 
##  6.880062e-02

#Conclusion – Predictors of Clinical Depression The logistic regression revealed that female gender, poor self-rated health, and low social contact significantly increase the odds of clinically significant depression. Surprisingly, higher vegetable intake was also positively associated, which may reflect measurement issues or confounding. Education showed no clear effect. The model explained a modest proportion of variance (McFadden R² = 0.056), consistent with typical findings in population health research.