Task Description for Homework 5:
Develop meaningful and informative graphics for selected aspects of your
paper on depression. For each graphic, follow these steps:
# United Kingdom (UK) subsample and depression score
# Keep all columns, but only the rows whose country is "United Kingdom".
df_uk <- df[df$cntry == "United Kingdom", ]

# Numeric copies (d20-d27) of the eight depression items.
# NOTE(review): if the items are stored as factors, as.numeric() returns the
# internal level codes - confirm these match the intended 1-4 answer scale.
depression_items <- c(
  d20 = "fltdpr",
  d21 = "flteeff",
  d22 = "slprl",
  d23 = "wrhpp",
  d24 = "fltlnl",
  d25 = "enjlf",
  d26 = "fltsd",
  d27 = "cldgng"
)
df_uk[names(depression_items)] <- lapply(
  df_uk[unname(depression_items)],
  as.numeric
)

# Depression score = mean of the eight items.
# na.rm = FALSE: the score is NA as soon as a single item is missing.
df_uk$depression <- rowSums(
  df_uk[, names(depression_items)],
  na.rm = FALSE
) / length(depression_items)
The dataset was subset to participants from the United Kingdom (UK), which gave an initial sample of n = 1684. Of these, n = 1635 respondents, aged 15 to 90 years, had a valid value for the depression variable.
In this section, “agea” (age) is converted to numeric, only participants with non-missing values for both age and depression are retained, and age groups are created. In addition, the depression scores, which take many intermediate values, are rounded to whole numbers (1–4).
# Age as a number: go through character first so that the stored labels are
# parsed as numbers instead of being read as internal factor codes.
df_uk$age <- as.numeric(as.character(df_uk[["agea"]]))

# Complete cases only: keep respondents with both a valid age and a valid
# depression score (49 missing depression, 37 missing age).
df_uk_complete <- df_uk[complete.cases(df_uk$age, df_uk$depression), ]

# Age groups in ten-year bands. cut() splits a numeric vector into intervals;
# include.lowest = TRUE puts the lowest limit (15) into the first interval.
# Ages above the last break (90) fall outside every interval and become NA.
df_uk_complete$age_group <- cut(
  df_uk_complete$age,
  breaks = c(15, seq(24, 74, by = 10), 90),
  labels = c("15–24", "25–34", "35–44", "45–54", "55–64", "65–74", "75+"),
  include.lowest = TRUE
)

# Rounded depression level, stored in the new column "depression_round".
# 1 = no or very mild symptoms; 2 = mild symptoms; 3 = moderate symptoms;
# 4 = severe symptoms.
# The raw score has many intermediate values; showing all of them would make
# the tables and figures hard to read.
df_uk_complete$depression_round <- round(df_uk_complete$depression)
For sample descriptions, tables are typically used to present data in the pure sciences (N. Mevenkamp, personal communication, June 12, 2025). Both absolute and relative frequencies are therefore presented in tables. In addition, visualizations were chosen to show group differences clearly.
# Depression by age group: absolute frequencies
# Cross-tabulation: how often does each rounded depression level occur in
# each age group?
table_absolute <- table(
  df_uk_complete$age_group,
  df_uk_complete$depression_round
)

# Append one bottom row with the column sums (margin = 1 sums over the rows)
# and label it "Total": the distribution of depression across all age groups.
column_sums <- addmargins(table_absolute, margin = 1)
rownames(column_sums)[nrow(column_sums)] <- "Total"

# Show the table.
# The N in the caption is computed from the cross-tabulation itself, so it
# always matches the number of respondents actually counted in the table
# (those with a valid age group and depression score).
# NOTE(review): col.names supplies five labels, one of them for the row-label
# column added by row.names = TRUE - confirm this renders as intended with
# the knitr version in use.
scroll_box(
  kable_styling(
    kable(
      column_sums,
      col.names = c(
        "Age groups",
        "1 = no or very mild symptoms",
        "2 = mild symptoms",
        "3 = moderate symptoms",
        "4 = severe symptoms"
      ),
      caption = paste0(
        "Table 1: Absolute frequencies of depression by age group (N = ",
        sum(table_absolute),
        ")"
      ),
      align = c("l", "r", "r", "r", "r", "r"),
      row.names = TRUE
    ),
    full_width = TRUE,
    font_size = 13,
    bootstrap_options = c("hover", "condensed")
  ),
  height = "300px"
)
| Age groups | 1 = no or very mild symptoms | 2 = mild symptoms | 3 = moderate symptoms | 4 = severe symptoms |
|---|---|---|---|---|
| 15–24 | 1 | 84 | 9 | 0 |
| 25–34 | 4 | 211 | 19 | 0 |
| 35–44 | 10 | 228 | 11 | 0 |
| 45–54 | 3 | 213 | 18 | 1 |
| 55–64 | 5 | 238 | 12 | 0 |
| 65–74 | 5 | 257 | 11 | 0 |
| 75+ | 6 | 245 | 14 | 0 |
| Total | 34 | 1476 | 94 | 1 |
# Figure 1 - grouped bar chart: the visual counterpart of the age group by
# depression level cross-tabulation.
# Within each age group the bars for the rounded depression levels (1-4) are
# placed side by side; bar height is the number of respondents, so the chart
# shows how the counts of each severity level differ by age.
ggplot(
  data = df_uk_complete,
  mapping = aes(x = age_group, fill = as.factor(depression_round))
) +
  geom_bar(position = "dodge") +
  scale_fill_manual(
    values = c(
      "1" = "#F8766D",
      "2" = "#7CAE00",
      "3" = "#00BFC4",
      "4" = "#C77CFF"
    )
  ) +
  ggtitle("Figure 1: Depression Levels (1-4) by Age Group (Absolute Counts)") +
  xlab("Age Group") +
  ylab("Count") +
  labs(fill = "Depression Score") +
  theme_minimal()
# Depression by age group: relative frequencies
# Row-wise proportions (margin = 1): each age group - the independent
# variable - sums to 100 %.
table_relative <- prop.table(table_absolute, margin = 1)

# Convert to percentages, rounded to one decimal place.
table_relative_percentages <- round(table_relative * 100, 1)

# Show the table. The age groups appear as row labels; col.names only names
# the four depression-level columns.
scroll_box(
  kable_styling(
    kable(
      table_relative_percentages,
      col.names = c(
        "1 = no or very mild symptoms",
        "2 = mild symptoms",
        "3 = moderate symptoms",
        "4 = severe symptoms"
      ),
      caption = "Table 2: Relative (%) frequencies of depression by age group",
      # No trailing comma after the last argument: it would pass an extra,
      # empty argument to kable().
      align = c("l", "r", "r", "r", "r", "r")
    ),
    full_width = TRUE,
    font_size = 13,
    bootstrap_options = c("hover", "condensed")
  ),
  height = "300px"
)
| | 1 = no or very mild symptoms | 2 = mild symptoms | 3 = moderate symptoms | 4 = severe symptoms |
|---|---|---|---|---|
| 15–24 | 1.1 | 89.4 | 9.6 | 0.0 |
| 25–34 | 1.7 | 90.2 | 8.1 | 0.0 |
| 35–44 | 4.0 | 91.6 | 4.4 | 0.0 |
| 45–54 | 1.3 | 90.6 | 7.7 | 0.4 |
| 55–64 | 2.0 | 93.3 | 4.7 | 0.0 |
| 65–74 | 1.8 | 94.1 | 4.0 | 0.0 |
| 75+ | 2.3 | 92.5 | 5.3 | 0.0 |
# Figure 2 - stacked bar chart: share of each rounded depression level within
# each age group, to compare how severity varies relatively across age
# categories.
ggplot(df_uk_complete, aes(x = age_group, fill = as.factor(depression_round))) +
  # position = "fill" stacks the bars and scales every age group to 100 %.
  geom_bar(position = "fill") +
  # scales::percent is namespace-qualified so the label formatter is found
  # even when the scales package is not attached.
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(
    values = c(
      "1" = "#F8766D",
      "2" = "#7CAE00",
      "3" = "#00BFC4",
      "4" = "#C77CFF"
    )
  ) +
  labs(
    title = "Figure 2: Depression levels by age group (relative)",
    x = "Age Group",
    y = "Percentage",
    fill = "Depression Score"
  ) +
  theme_minimal()
# Gender in the UK sample: how many respondents of each gender are there, and
# how balanced is the distribution?

# One-way frequency table of gender.
gender_table <- table(df_uk$gndr)

# Same counts as a two-column data frame ("Gender", "Count") for display.
gender_df_uk <- as.data.frame(gender_table)
colnames(gender_df_uk) <- c("Gender", "Count")

# Show the table.
scroll_box(
  kable_input = kable_styling(
    kable_input = kable(
      x = gender_df_uk,
      align = c("l", "r"),
      caption = "Table 3: Frequency distribution of Gender"
    ),
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = TRUE,
    position = "left",
    font_size = 13
  ),
  height = "300px"
)
| Gender | Count |
|---|---|
| Male | 824 |
| Female | 860 |
# Figure 3 - univariate frequency distribution of gender (nominal scale):
# one bar per gender, bar height = number of respondents, bars coloured by
# gender.
ggplot(data = df_uk, mapping = aes(x = gndr, fill = gndr)) +
  geom_bar() +
  scale_fill_manual(
    values = c(
      "Female" = "orchid",
      "Male" = "cornflowerblue"
    )
  ) +
  labs(
    title = "Figure 3: Gender distribution",
    subtitle = "ESS Round11",
    caption = "Valentina Lanser",
    x = "Gender",
    y = "Count",
    fill = "Gender"
  ) +
  theme_minimal()
# Figure 4 - bivariate analysis: visual representation of the depression level
# by gender cross-tabulation.
# Stacked bar chart showing the gender distribution within each depression
# level: are certain levels more prevalent among one gender?

# 1. Keep only respondents whose ROUNDED depression score is a valid level
#    (1 to 4). The raw score is the mean of eight items and is mostly not a
#    whole number, so it has to be rounded before it is compared with 1:4;
#    otherwise every respondent with an intermediate score would be dropped.
#    Missing scores never match 1:4 and are excluded as well.
df_uk_validvalues <- df_uk[round(df_uk$depression) %in% 1:4, ]

# 2. Convert the rounded score to a factor with the levels in ascending order.
df_uk_validvalues$depression <- factor(
  round(df_uk_validvalues$depression),
  levels = c(1, 2, 3, 4)
)

# 3. Create the graph. position = "fill" scales each bar to 100 %, so the
#    segments show the share of each gender within a depression level.
#    The colours are named so that they do not depend on the level order of
#    gndr and match the gender colours used in Figure 3.
ggplot(df_uk_validvalues, aes(depression)) +
  geom_bar(aes(fill = gndr), position = "fill", width = 0.6) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(
    values = c("Male" = "cornflowerblue", "Female" = "orchid")
  ) +
  labs(
    title = "Figure 4: Depression by Gender",
    subtitle = "ESS Round11",
    x = "Depression Level",
    y = "",
    fill = "Gender"
  ) +
  theme_minimal()