library(foreign)
library(ltm)
# Load the data
df <- read.spss("C:/Users/Besitzer/Documents/Quantitaive/ESS11_unlabeled.0-10.sav", to.data.frame = TRUE)
# Convert CES-D8 items to numeric
df$d1 <- as.numeric(df$fltdpr)
df$d2 <- as.numeric(df$flteeff)
df$d3 <- as.numeric(df$fltlnl)
df$d4 <- 5 - as.numeric(df$enjlf) # reverse-coded
df$d5 <- as.numeric(df$fltsd)
df$d6 <- 5 - as.numeric(df$wrhpp) # reverse-coded
df$d7 <- as.numeric(df$slprl)
df$d8 <- as.numeric(df$cldgng)
# Check reliability
cronbach.alpha(df[, paste0("d", 1:8)], na.rm = TRUE)
##
## Cronbach's alpha for the 'df[, paste0("d", 1:8)]' data-set
##
## Items: 8
## Sample units: 40156
## alpha: 0.823
# Compute CES-D8 score
df$cesd8 <- rowMeans(df[, paste0("d", 1:8)], na.rm = TRUE)
summary(df$cesd8)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.375 1.625 1.698 2.000 4.000 52
# Histogram of CES-D8 scores
hist(df$cesd8, main = "Distribution of CES-D8 Depression Scores", xlab = "CES-D8 Score")

# Convert age to numeric
df$agea <- as.numeric(df$agea)
# Correlation between depression and age
cor(df$cesd8, df$agea, use = "complete.obs")
## [1] 0.1140684
# Linear regression: CES-D8 ~ Age
model_age <- lm(cesd8 ~ agea, data = df)
summary(model_age)
##
## Call:
## lm(formula = cesd8 ~ agea, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.81341 -0.36524 -0.09391 0.26890 2.36951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.581717 0.005599 282.49 <2e-16 ***
## agea 0.003049 0.000133 22.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4975 on 39843 degrees of freedom
## (311 Beobachtungen als fehlend gelöscht)
## Multiple R-squared: 0.01301, Adjusted R-squared: 0.01299
## F-statistic: 525.3 on 1 and 39843 DF, p-value: < 2.2e-16
# Scatterplot with regression line
plot(df$agea, df$cesd8, xlab = "Age", ylab = "CES-D8 Depression Score", main = "Depression by Age")
abline(model_age, col = "red")

# Convert categorical variables to factors (dummy variables)
df$gndr <- factor(df$gndr, labels = c("male", "female"))
df$health <- factor(df$health)
# Multivariate model: CES-D8 ~ Age + Gender + Health
model_multi <- lm(cesd8 ~ agea + gndr + health, data = df)
summary(model_multi)
##
## Call:
## lm(formula = cesd8 ~ agea + gndr + health, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.65037 -0.30610 -0.05556 0.23670 2.48333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.4662011 0.0060770 241.27 <2e-16 ***
## agea -0.0016203 0.0001292 -12.54 <2e-16 ***
## gndrfemale 0.1021926 0.0044540 22.94 <2e-16 ***
## healthGood 0.1661791 0.0056783 29.27 <2e-16 ***
## healthFair 0.4195330 0.0067007 62.61 <2e-16 ***
## healthBad 0.8195142 0.0104869 78.15 <2e-16 ***
## healthVery bad 1.1759467 0.0217710 54.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4428 on 39794 degrees of freedom
## (355 Beobachtungen als fehlend gelöscht)
## Multiple R-squared: 0.217, Adjusted R-squared: 0.2168
## F-statistic: 1838 on 6 and 39794 DF, p-value: < 2.2e-16
# Mean CES-D8 scores by gender
tapply(df$cesd8, df$gndr, mean, na.rm = TRUE)
## male female
## 1.632050 1.755568