# install.packages("foreign")
library(foreign)
# install.packages("ltm")
library(ltm)
library(knitr)
library(likert)
# Set default behaviour for R chunks:
# echo = TRUE >> show R output in final document
# message = FALSE >> do not show R messages in final document
# warning = FALSE >> do not show R warnings in final document
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

setwd("/Users/sonjakoncar/Desktop/MCI/SEM2/Advanced Statistics")
df = read.spss("ESS11.sav", to.data.frame = T)

Introduction

This analysis uses data from the European Social Survey (ESS), specifically focusing on Hungary, to explore social determinants of depression. Depression is measured through eight self-reported indicators of emotional wellbeing over the past week. The study tests five hypotheses concerning the association between depression and various social factors: educational level, gender, self-rated health, internet usage frequency, and social activity. The aim is to identify patterns that may inform policy and intervention strategies.

# Filtering data set to only include responses from Hungary
df = df[df$cntry=="Hungary",]
nrow(df)
## [1] 2118

Methods

The dataset was filtered to include only Hungarian respondents (n = 2,118). Depression was operationalised as the average score across eight items indicating emotional states, including feeling depressed, restless sleep, happiness, and loneliness. Two items measuring positive emotions were reverse-coded.

vnames = c("fltdpr", "flteeff", "slprl","wrhpp", "fltlnl", "enjlf", "fltsd", "cldgng")
likert_df = df[,vnames]
likert(likert_df)
##      Item None or almost none of the time Some of the time Most of the time
## 1  fltdpr                       41.406988         47.02550         9.726157
## 2 flteeff                       41.749409         43.45154        12.198582
## 3   slprl                       28.936170         52.10402        16.643026
## 4   wrhpp                        5.035629         21.09264        49.833729
## 5  fltlnl                       61.410985         26.89394         8.143939
## 6   enjlf                        7.531975         24.72762        46.707721
## 7   fltsd                       47.328605         43.30969         8.179669
## 8  cldgng                       46.764289         40.52905        10.769957
##   All or almost all of the time
## 1                      1.841360
## 2                      2.600473
## 3                      2.316785
## 4                     24.038005
## 5                      3.551136
## 6                     21.032686
## 7                      1.182033
## 8                      1.936703
plot(likert(likert_df))

# Converting responses to numbers
likert_numeric_df = df[,vnames]
likert_numeric_df$d20 = as.numeric(likert_numeric_df[,vnames[1]])
likert_numeric_df$d21 = as.numeric(likert_numeric_df[,vnames[2]])
likert_numeric_df$d22 = as.numeric(likert_numeric_df[,vnames[3]])
likert_numeric_df$d23 = as.numeric(likert_numeric_df[,vnames[4]])
likert_numeric_df$d24 = as.numeric(likert_numeric_df[,vnames[5]])
likert_numeric_df$d25 = as.numeric(likert_numeric_df[,vnames[6]])
likert_numeric_df$d26 = as.numeric(likert_numeric_df[,vnames[7]])
likert_numeric_df$d27 = as.numeric(likert_numeric_df[,vnames[8]])

# Reversing scale of positive items
likert_numeric_df$d23 = 5 - likert_numeric_df$d23
likert_numeric_df$d25 = 5 - likert_numeric_df$d25

# Consistency check
cronbach.alpha(likert_numeric_df[,c("d20","d21","d22","d23","d24","d25","d26","d27")], na.rm=T)
## 
## Cronbach's alpha for the 'likert_numeric_df[, c("d20", "d21", "d22", "d23", "d24", "d25", ' '    "d26", "d27")]' data-set
## 
## Items: 8
## Sample units: 2118
## alpha: 0.845

Cronbach’s alpha 0.8446908 confirmed internal consistency of the scale.

# Computing average depression score
likert_numeric_df$depression = rowSums(likert_numeric_df[,c("d20","d21","d22","d23","d24","d25","d26","d27")]) -8
df$depression = likert_numeric_df$depression

table(df$depression)
## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19 
##  88 121 181 187 183 205 192 179 169 122 119  87  65  39  39  36  29  14   7   6 
##  20  21  22  23 
##   6   8   1   1
# Descriptives
summary(df$depression)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   3.000   6.000   6.454   9.000  23.000      34
hist(likert_numeric_df$depression, breaks=8)

Hypotheses

  1. Individuals with higher educational levels are associated with lower depression scores.
  2. Women are more likely to report higher depression scores compared to men.
  3. Individuals with better self-reported health levels are likely to report lower levels of depression.
  4. Excessive internet use increases levels of depression.
  5. Individuals who frequently socialize with friends, relatives, or colleagues are less likely to experience symptoms of depression compared to those who socialize less frequently.

Hypothesis 1

Individuals with higher educational levels are associated with lower depression scores.

# Recoding "Highest level of education, ES - ISCED" into 3 groups - low, medium and high
df$edu = factor(NA, levels=c("low", "medium", "high"))
# Original values
kable(table(df$eisced),
      col.names = c("Education","n"),
      caption = "Frequency of Answers by Education"
      )
Frequency of Answers by Education
Education n
Not possible to harmonise into ES-ISCED 0
ES-ISCED I , less than lower secondary 27
ES-ISCED II, lower secondary 377
ES-ISCED IIIb, lower tier upper secondary 623
ES-ISCED IIIa, upper tier upper secondary 679
ES-ISCED IV, advanced vocational, sub-degree 141
ES-ISCED V1, lower tertiary education, BA level 195
ES-ISCED V2, higher tertiary education, >= MA level 73
Other 0
df$edu[df$eisced == "ES-ISCED I , less than lower secondary"] = "low"
df$edu[df$eisced == "ES-ISCED II, lower secondary"] = "low"
df$edu[df$eisced == "ES-ISCED IIIb, lower tier upper secondary"] = "medium"
df$edu[df$eisced == "ES-ISCED IIIa, upper tier upper secondary"] = "medium"
df$edu[df$eisced == "ES-ISCED IV, advanced vocational, sub-degree"] = "high"
df$edu[df$eisced == "ES-ISCED V1, lower tertiary education, BA level"] = "high"
df$edu[df$eisced == "ES-ISCED V2, higher tertiary education, >= MA level"] = "high"

# As numeric
df$edunum = as.numeric(df$edu)
# Check
kable(table(df$edunum),
      col.names = c("Educational Level by Low (1), Medium (2), High (3)","n"),
      caption = "Frequency of Answers by Educational Level"
      )
Frequency of Answers by Educational Level
Educational Level by Low (1), Medium (2), High (3) n
1 404
2 1302
3 409
# Anova to check for differences in depression levels by educational category
anova_model= aov(depression ~ edu, data = df)
summary(anova_model)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## edu            2   2443  1221.7   73.41 <2e-16 ***
## Residuals   2078  34582    16.6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 37 observations deleted due to missingness
# Result shows high significance
# Group means to assess hypothesis confirmation
means_df = data.frame(
by(likert_numeric_df$depression, df$edu, mean, na.rm=T)
)
kable(means_df,
      col.names = c("Educational Level","Average Depression Score"),
      caption = "Average Depression Score by Educational Level"
      )
Average Depression Score by Educational Level
Educational Level Average Depression Score
low 8.501272
medium 6.250389
high 5.094527

Result

Respondents were recoded into three educational levels (low, medium, high). ANOVA revealed a statistically significant difference in depression scores across education levels (F(2,2078) = 73.41, p < 0.001). Post hoc mean comparisons showed that respondents with low education had the highest average depression score (M = 2.06), followed by medium (M = 1.78), and high education (M = 1.64).

This supports the hypothesis that higher education is associated with lower depression levels. Possible explanations include increased coping resources and better access to social capital among the more educated.

Hypothesis 2

Women are more likely to report higher depression scores compared to men.

# Computing and comparing mean depression score by gender
means_df = data.frame(
by(likert_numeric_df$depression, df$gndr, mean, na.rm=T)
)
kable(means_df,
      col.names = c("Gender","Average Depression Score"),
      caption = "Average Depression Score by Gender"
      )
Average Depression Score by Gender
Gender Average Depression Score
Male 6.019417
Female 6.738889
# Result
# Females show higher depression rates than males, confirming the hypothesis.
kable(table(df$gndr),
      col.names = c("Gender","n"),
      caption = "Frequency of Answers by Gender"
      )
Frequency of Answers by Gender
Gender n
Male 835
Female 1283
# Creating binary variable for gender: 1 for female, 0 for male
df$female = NA
df$female[df$gndr=="Male"] = 0
df$female[df$gndr=="Female"] = 1

Result

Mean scores by gender indicated that women (M = 1.84) reported higher depression levels than men (M = 1.75). The gender variable was recoded into binary format for further analysis.

Findings confirm the hypothesis and are consistent with global trends indicating higher rates of depression in women, possibly due to biological, psychological, and social factors.

Hypothesis 3

Individuals who report ‘very good’ health will have significantly lower depression scores than those reporting ‘bad’ or ‘very bad’ health.

# Computing mean depression scores for each health level
means_df = data.frame(
by(likert_numeric_df$depression, df$health, mean, na.rm=T)
)
kable(means_df,
      col.names = c("Health Level","Average Depression Score"),
      caption = "Average Depression Score by Subjective Health Level"
      )
Average Depression Score by Subjective Health Level
Health Level Average Depression Score
Very good 3.878906
Good 5.805369
Fair 8.072581
Bad 11.866197
Very bad 14.736842
# Check
kable(table(df$health),
      col.names = c("Health Level","n"),
      caption = "Frequency of Answers by Subjective Health Level"
      )
Frequency of Answers by Subjective Health Level
Health Level n
Very good 521
Good 905
Fair 505
Bad 145
Very bad 40

Result

Subjective health ratings showed a clear gradient: those reporting “very good” health had the lowest depression scores (M = 1.48), whereas those reporting “very bad” health had the highest (M = 2.84).

These results support the hypothesis and suggest a strong link between physical and mental wellbeing. Individuals with poor health may experience limitations that contribute to depressive symptoms.

Hypothesis 4

Excessive internet use increases levels of depression.

# Computing mean depression scores for different internet usage levels
means_df = data.frame(
by(likert_numeric_df$depression, df$netusoft, mean, na.rm=T)
)
kable(means_df,
      col.names = c("Amount of Internet Use","Average Depression Score"),
      caption = "Average Depression Score by Amount of Internet Use"
      )
Average Depression Score by Amount of Internet Use
Amount of Internet Use Average Depression Score
Never 9.715736
Only occasionally 8.081967
A few times a week 6.841060
Most days 6.277778
Every day 5.299585
# Check
kable(table(df$netusoft),
      col.names = c("Amount of Internet Use","n"),
      caption = "Internet Use by Frquency"
      )
Internet Use by Frquency
Amount of Internet Use n
Never 406
Only occasionally 64
A few times a week 154
Most days 272
Every day 1219

Result

Contrary to the hypothesis, depression scores decreased with increasing internet use. Respondents who never used the internet had the highest scores (M = 2.21), while daily users reported the lowest (M = 1.66).

This finding contradicts the hypothesis. One explanation may be that internet use facilitates social connection, information access, or entertainment, which can serve as protective factors against depression.

Hypothesis 5

Individuals who frequently socialize with friends, relatives, or colleagues are less likely to experience symptoms of depression compared to those who socialize less frequently.

# Computing mean depression scores by socialization frequency
means_df = data.frame(
by(likert_numeric_df$depression, df$sclmeet, mean, na.rm=T)
)
kable(means_df,
      col.names = c("Amount of Socialising","Average Depression Score"),
      caption = "Average Depression Score by Amount of Socialisation"
      )
Average Depression Score by Amount of Socialisation
Amount of Socialising Average Depression Score
Never 10.751634
Less than once a month 7.588679
Once a month 5.844737
Several times a month 5.197343
Once a week 5.707510
Several times a week 5.324468
Every day 7.140000
# Number of respondents per socialisation frequency group
kable(table(df$sclmeet),
      col.names = c("Amount of Socialising","n"),
      caption = "Frequency of Answers by Amount of Socialisation"
      )
Frequency of Answers by Amount of Socialisation
Amount of Socialising n
Never 159
Less than once a month 538
Once a month 386
Several times a month 534
Once a week 256
Several times a week 192
Every day 50
# Testing for differences in depression scores by socialization frequency
anova_result = aov(depression ~ sclmeet, data = df)
summary(anova_result)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## sclmeet        6   4886   814.3   52.45 <2e-16 ***
## Residuals   2074  32198    15.5                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 37 observations deleted due to missingness
# Computing mean depression scores by socialization frequency
means_df = data.frame(
by(likert_numeric_df$depression, df$sclmeet, mean, na.rm=T)
)
kable(means_df,
      col.names = c("Amount of Socialising","Average Depression Score"),
      caption = "Average Depression Score by Amount of Socialisation"
      )
Average Depression Score by Amount of Socialisation
Amount of Socialising Average Depression Score
Never 10.751634
Less than once a month 7.588679
Once a month 5.844737
Several times a month 5.197343
Once a week 5.707510
Several times a week 5.324468
Every day 7.140000
# Check
kable(table(df$sclmeet),
      col.names = c("Amount of Socialising","n"),
      caption = "Frequency of Answers by Amount of Socialisation"
      )
Frequency of Answers by Amount of Socialisation
Amount of Socialising n
Never 159
Less than once a month 538
Once a month 386
Several times a month 534
Once a week 256
Several times a week 192
Every day 50
# Testing for differences in depression scores by socialization frequency
anova_result = aov(depression ~ sclmeet, data = df)
summary(anova_result)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## sclmeet        6   4886   814.3   52.45 <2e-16 ***
## Residuals   2074  32198    15.5                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 37 observations deleted due to missingness

ANOVA results showed significant differences in depression by socialising frequency (F(6,2074) = 52.45, p < 0.001). Depression was highest among those who never socialised (M = 2.34) and lowest for those socialising several times per month (M = 1.65).

Regression Analysis

df$gender = ifelse(df$gndr == "Female", 1, 0)  # Binary: Female = 1
df$health_num = as.numeric(df$health)
df$internet_use = as.numeric(df$netusoft)
df$social_freq = as.numeric(df$sclmeet)

# Combined weight: design * post-stratification
df$reg_weight = df$dweight * df$pspwght

# Removing rows with missing predictors or weights
regression_df = df[complete.cases(df[, c(
  "depression", "edunum", "gender", "health_num", 
  "internet_use", "social_freq", "reg_weight"
)]), ]

# Linear regression
regression_model = lm(
  depression ~ edunum + gender + health_num + internet_use + social_freq,
  data = regression_df,
  weights = reg_weight
)

regression_summary = coef(summary(regression_model))
kable(regression_summary,
  digits = 4,
  col.names = c("Estimate", "Std. Error", "t value", "p value"),
  caption = "Predictors of Depression"
)
Predictors of Depression
Estimate Std. Error t value p value
(Intercept) 5.2990 0.4737 11.1853 0.0000
edunum -0.7613 0.1112 -6.8439 0.0000
gender 0.4759 0.1462 3.2542 0.0012
health_num 1.9643 0.0932 21.0840 0.0000
internet_use -0.3279 0.0622 -5.2700 0.0000
social_freq -0.1777 0.0524 -3.3911 0.0007

Predictors of Clinically Significant Depression

# Binary outcome variable
df$clin_depr = ifelse(df$depression >= 9, 1, 0)

# Frequency distribution
kable(table(df$clin_depr),
      col.names = c("Clinically Significant Depression (0 = No, 1 = Yes)", "n"),
      caption = "Frequency of Clinically Significant Depression")
Frequency of Clinically Significant Depression
Clinically Significant Depression (0 = No, 1 = Yes) n
0 1505
1 579
prop.table(table(df$clin_depr))
## 
##         0         1 
## 0.7221689 0.2778311

Logistic Regression Model

# Converting predictors to factor
df$gndr = factor(df$gndr)
df$health = factor(df$health)
df$netusoft = factor(df$netusoft)
df$sclmeet = factor(df$sclmeet)

# Removing missing values and selectiv variables
df_model = na.omit(df[, c("clin_depr", "edunum", "gndr", "health", "netusoft", "sclmeet", "pspwght")])

# Logistic regression model 
log_model = glm(clin_depr ~ edunum + gndr + health + netusoft + sclmeet,
                 data = df_model, family = binomial, weights = pspwght)

#summary with coefficients, standard errors, z-values and p-values
summary(log_model)
## 
## Call:
## glm(formula = clin_depr ~ edunum + gndr + health + netusoft + 
##     sclmeet, family = binomial, data = df_model, weights = pspwght)
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -0.80174    0.34050  -2.355 0.018542 *  
## edunum                        -0.42328    0.09453  -4.478 7.55e-06 ***
## gndrFemale                     0.08171    0.12039   0.679 0.497305    
## healthGood                     1.41837    0.21350   6.643 3.06e-11 ***
## healthFair                     2.34753    0.22590  10.392  < 2e-16 ***
## healthBad                      3.34393    0.29455  11.353  < 2e-16 ***
## healthVery bad                 4.83965    0.80708   5.997 2.02e-09 ***
## netusoftOnly occasionally      0.46678    0.33470   1.395 0.163119    
## netusoftA few times a week    -0.23349    0.24243  -0.963 0.335497    
## netusoftMost days             -0.35082    0.22082  -1.589 0.112119    
## netusoftEvery day             -0.33790    0.17846  -1.893 0.058298 .  
## sclmeetLess than once a month -0.63882    0.23318  -2.740 0.006151 ** 
## sclmeetOnce a month           -1.05419    0.25580  -4.121 3.77e-05 ***
## sclmeetSeveral times a month  -1.25775    0.25039  -5.023 5.08e-07 ***
## sclmeetOnce a week            -1.19575    0.28355  -4.217 2.48e-05 ***
## sclmeetSeveral times a week   -1.12902    0.30366  -3.718 0.000201 ***
## sclmeetEvery day              -0.62118    0.43149  -1.440 0.149980    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2335.4  on 2072  degrees of freedom
## Residual deviance: 1774.5  on 2056  degrees of freedom
## AIC: 1893.8
## 
## Number of Fisher Scoring iterations: 5
# Pseudo R²
ll_full = as.numeric(logLik(log_model))
ll_null = as.numeric(logLik(update(log_model, . ~ 1)))
r_mcfadden = 1 - (ll_full / ll_null)
r_mcfadden
## [1] 0.2424953
# Computing odds ratios and 95% confidence intervals
OR = exp(coef(log_model))
CI = exp(confint.default(log_model))  # Use confint.default for compatibility

# Combining OR and CI into a table
or_table = cbind(OR, CI)

kable(round(or_table, 3),
      col.names = c("Odds Ratio", "CI Lower", "CI Upper"),
      caption = "Odds Ratios with 95% Confidence Intervals")
Odds Ratios with 95% Confidence Intervals
Odds Ratio CI Lower CI Upper
(Intercept) 0.449 0.230 0.874
edunum 0.655 0.544 0.788
gndrFemale 1.085 0.857 1.374
healthGood 4.130 2.718 6.277
healthFair 10.460 6.718 16.286
healthBad 28.330 15.905 50.463
healthVery bad 126.425 25.992 614.916
netusoftOnly occasionally 1.595 0.828 3.073
netusoftA few times a week 0.792 0.492 1.273
netusoftMost days 0.704 0.457 1.085
netusoftEvery day 0.713 0.503 1.012
sclmeetLess than once a month 0.528 0.334 0.834
sclmeetOnce a month 0.348 0.211 0.575
sclmeetSeveral times a month 0.284 0.174 0.464
sclmeetOnce a week 0.302 0.174 0.527
sclmeetSeveral times a week 0.323 0.178 0.586
sclmeetEvery day 0.537 0.231 1.252
# Returning pseudo R²
r_mcfadden 
## [1] 0.2424953

Methods added

Depression was measured using eight self-reported emotional indicators, reverse-coding positive items, and summing the responses (range: 0–24). A binary variable was created to indicate clinically significant depressive symptoms: respondents with a score of 9 or higher were coded as 1, and others as 0. This threshold reflects consistent symptom presence, as supported in relevant literature. The logistic regression model included predictors such as education level, gender, self-rated health, internet usage and social contact frequency. Sampling weights (pspwght) were applied. Odds ratios (OR) and 95% confidence intervals (CI) were computed using exponentiated coefficients. Model fit was assessed using McFadden’s pseudo R².

Results added part

Among Hungarian respondents, 27.8% (n = 588) met the threshold for clinically significant depression, while 72.2% (n = 1,496) did not. Weighted logistic regression showed that higher education, better self-rated health and more frequent social contact were associated with lower odds of clinically significant depression. McFadden’s pseudo R² showed a moderate model fit (R² = 0.18), indicating that the included variables explain a meaningful portion of the variance in depression status.

Result

The hypothesis is partially confirmed. While less socialising correlates with higher depression, daily socialisers reported slightly higher depression than those who socialise several times per week or per month, suggesting a non-linear relationship that may be influenced by quality rather than just quantity of interactions.

Conclusion

This study confirms that depression is significantly influenced by various social determinants in Hungary. Higher education, better self-rated health, and frequent social contact correlate with lower depression scores. Gender differences and internet usage patterns offer additional insights. These findings may inform targeted mental health interventions and social policies to mitigate depression risk in vulnerable groups.