library(foreign)
library(ltm)




# Load the data

df <- read.spss("C:/Users/Besitzer/Documents/Quantitaive/ESS11_unlabeled.0-10.sav", to.data.frame = TRUE)

# Convert CES-D8 items to numeric
df$d1 <- as.numeric(df$fltdpr)
df$d2 <- as.numeric(df$flteeff)
df$d3 <- as.numeric(df$fltlnl)
df$d4 <- 5 - as.numeric(df$enjlf) # reverse-coded
df$d5 <- as.numeric(df$fltsd)
df$d6 <- 5 - as.numeric(df$wrhpp) # reverse-coded
df$d7 <- as.numeric(df$slprl)
df$d8 <- as.numeric(df$cldgng)

# Check reliability
cronbach.alpha(df[, paste0("d", 1:8)], na.rm = TRUE)
## 
## Cronbach's alpha for the 'df[, paste0("d", 1:8)]' data-set
## 
## Items: 8
## Sample units: 40156
## alpha: 0.823
# Compute CES-D8 score
df$cesd8 <- rowMeans(df[, paste0("d", 1:8)], na.rm = TRUE)
summary(df$cesd8)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.375   1.625   1.698   2.000   4.000      52
# Histogram of CES-D8 scores
hist(df$cesd8, main = "Distribution of CES-D8 Depression Scores", xlab = "CES-D8 Score")

# Convert age to numeric
df$agea <- as.numeric(df$agea)

# Correlation between depression and age
cor(df$cesd8, df$agea, use = "complete.obs")
## [1] 0.1140684
# Linear regression: CES-D8 ~ Age
model_age <- lm(cesd8 ~ agea, data = df)
summary(model_age)
## 
## Call:
## lm(formula = cesd8 ~ agea, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81341 -0.36524 -0.09391  0.26890  2.36951 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.581717   0.005599  282.49   <2e-16 ***
## agea        0.003049   0.000133   22.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4975 on 39843 degrees of freedom
##   (311 Beobachtungen als fehlend gelöscht)
## Multiple R-squared:  0.01301,    Adjusted R-squared:  0.01299 
## F-statistic: 525.3 on 1 and 39843 DF,  p-value: < 2.2e-16
# Scatterplot with regression line
plot(df$agea, df$cesd8, xlab = "Age", ylab = "CES-D8 Depression Score", main = "Depression by Age")
abline(model_age, col = "red")

# Convert categorical variables to factors (dummy variables)
df$gndr <- factor(df$gndr, labels = c("male", "female"))
df$health <- factor(df$health)

# Multivariate model: CES-D8 ~ Age + Gender + Health
model_multi <- lm(cesd8 ~ agea + gndr + health, data = df)
summary(model_multi)
## 
## Call:
## lm(formula = cesd8 ~ agea + gndr + health, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.65037 -0.30610 -0.05556  0.23670  2.48333 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.4662011  0.0060770  241.27   <2e-16 ***
## agea           -0.0016203  0.0001292  -12.54   <2e-16 ***
## gndrfemale      0.1021926  0.0044540   22.94   <2e-16 ***
## healthGood      0.1661791  0.0056783   29.27   <2e-16 ***
## healthFair      0.4195330  0.0067007   62.61   <2e-16 ***
## healthBad       0.8195142  0.0104869   78.15   <2e-16 ***
## healthVery bad  1.1759467  0.0217710   54.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4428 on 39794 degrees of freedom
##   (355 Beobachtungen als fehlend gelöscht)
## Multiple R-squared:  0.217,  Adjusted R-squared:  0.2168 
## F-statistic:  1838 on 6 and 39794 DF,  p-value: < 2.2e-16
# Mean CES-D8 scores by gender
tapply(df$cesd8, df$gndr, mean, na.rm = TRUE)
##     male   female 
## 1.632050 1.755568