# List of packages
packages <- c("tidyverse", "gt", "gapminder", "srvyr", "fst", "ggridges")
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'srvyr'
##
##
## The following object is masked from 'package:stats':
##
## filter
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "gt" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "gapminder" "gt" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "srvyr" "gapminder" "gt" "lubridate" "forcats" "stringr"
## [7] "dplyr" "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [13] "tidyverse" "stats" "graphics" "grDevices" "utils" "datasets"
## [19] "methods" "base"
##
## [[5]]
## [1] "fst" "srvyr" "gapminder" "gt" "lubridate" "forcats"
## [7] "stringr" "dplyr" "purrr" "readr" "tidyr" "tibble"
## [13] "ggplot2" "tidyverse" "stats" "graphics" "grDevices" "utils"
## [19] "datasets" "methods" "base"
##
## [[6]]
## [1] "ggridges" "fst" "srvyr" "gapminder" "gt" "lubridate"
## [7] "forcats" "stringr" "dplyr" "purrr" "readr" "tidyr"
## [13] "tibble" "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [19] "utils" "datasets" "methods" "base"
# Filter data for years 1987 and 2007
gap_filtered <- gapminder %>%
filter(year %in% c(1987, 2007))
# Calculate mean life expectancy by continent for each year
mean_lifeExp <- gap_filtered %>%
group_by(continent, year) %>%
summarise(mean_lifeExp = mean(lifeExp), .groups = "drop")
# Reshape data and calculate the change in life expectancy
lifeExp_change <- mean_lifeExp %>%
spread(year, mean_lifeExp) %>%
mutate(change = `2007` - `1987`)
# print the results
print(lifeExp_change)
## # A tibble: 5 × 4
## continent `1987` `2007` change
## <fct> <dbl> <dbl> <dbl>
## 1 Africa 53.3 54.8 1.46
## 2 Americas 68.1 73.6 5.52
## 3 Asia 64.9 70.7 5.88
## 4 Europe 73.6 77.6 4.01
## 5 Oceania 75.3 80.7 5.40
lines 33-35 groups the data by continent and year and calculates the average life expectancy (mean(lifeExp)) for each continent in both years. spread(year, mean_lifeExp) converts the year column into seperate columns (1987 and 2007), with the mean life expectancy as values. mutate(change= 2007-1987) creates a new column “change” that calculates the difference in life expectancy between 2007 and 1987.
# Filter the lifeExp_change table for the five focal countries
focal_countries_table <- gap_filtered %>%
filter(country %in% c("Niger", "Bangladesh", "El Salvador", "Iraq", "Zimbabwe")) %>%
group_by(country, year) %>%
summarise(mean_lifeExp = mean(lifeExp), .groups = "drop") %>%
spread(year, mean_lifeExp) %>%
mutate(Change = `2007` - `1987`)
# Display the filtered table for the focal countries
focal_countries_table
## # A tibble: 5 × 4
## country `1987` `2007` Change
## <fct> <dbl> <dbl> <dbl>
## 1 Bangladesh 52.8 64.1 11.2
## 2 El Salvador 63.2 71.9 8.72
## 3 Iraq 65.0 59.5 -5.50
## 4 Niger 44.6 56.9 12.3
## 5 Zimbabwe 62.4 43.5 -18.9
The dataset gap_filtered is filtered to keep only rows where the country column matches one of the five focal countries. This data is then grouped by country and year. Within each country-year group, the mean of the lifeExp column is calculated. The .group=“drop” ensures that after summarization, the grouping is removed, meaning the resulting dataset is no longer grouped. The wide format shows each column represents a year and contains the corresponding mean life expectancy for that country.
formatted_table <- lifeExp_change %>%
select(continent, `1987`, `2007`, change) %>% # Ensure correct columns
gt() %>%
tab_header(
title = md("**Life Expectancy Changes by Continent**"),
subtitle = md("*Average life expectancy in years*")
) %>%
cols_label(
continent = "Continent",
`1987` = "1987 values",
`2007` = "2007 values",
change = "Change (2007-1987)"
) %>%
fmt_number(
columns = c(`1987`, `2007`, change),
decimals = 1
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels(everything())
)
# Display the formatted table
formatted_table
| Life Expectancy Changes by Continent | |||
| Average life expectancy in years | |||
| Continent | 1987 values | 2007 values | Change (2007-1987) |
|---|---|---|---|
| Africa | 53.3 | 54.8 | 1.5 |
| Americas | 68.1 | 73.6 | 5.5 |
| Asia | 64.9 | 70.7 | 5.9 |
| Europe | 73.6 | 77.6 | 4.0 |
| Oceania | 75.3 | 80.7 | 5.4 |
cols_label will list what I want each column to be named and if I want to change that name. decimal= includes what I want the decial point to.
# Filter the data for the five selected countries (Niger, Bangladesh, El Salvador, Iraq, Zimbabwe)
focal_countries <- gap_filtered %>%
filter(country %in% c("Niger", "Bangladesh", "El Salvador", "Iraq", "Zimbabwe"))
# Create the plot using ggplot2
lifeExp_plot <- ggplot(focal_countries, aes(x = year, y = lifeExp, color = country, group = country)) +
geom_line(size = 1.5) + # Line thickness
scale_color_brewer(palette = "Set1") + # Use color palette from RColorBrewer
theme_minimal() + # Use minimal theme (no background grid)
theme(
panel.grid.minor = element_blank(), # Remove minor grid lines
legend.position = "bottom", # Position the legend at the bottom
plot.title = element_text(face = "bold", size = 14), # Make the title bold and set size
plot.subtitle = element_text(size = 12) # Set subtitle size
) +
labs(
title = "Life Expectancy Trajectories (1987-2007)", # Title of the plot
subtitle = "in Selected Countries", # Subtitle
x = "Year", # X-axis label
y = "Life Expectancy (years)", # Y-axis label
color = "Country" # Legend label for the color
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# 3. Display the plot
print(lifeExp_plot)
This chunk is creating a line plot using ggplot2 to visualize changes in
life expectancy over time (1987-2007) for five selected countries.
gap_filtered selects only 5 coutries ggplot2 to plot life exxpectancy
over time the colour palette chosen is RColorBrewer
I decided to group all of this information into the same chunk, as opposed to multiple chunks, to make sure it ran good while all together and provided the desired look of the graph.
Task 2: Interpersonal Trust Patterns (3 points) a. Data Manipulation (anes_2020 data) • Remove missing values for the variables TrustPeople and AgeGroup • Calculate percentage of trust categories by age group
load("anes_2020.rda")
table(anes_2020$TrustPeople)
##
## Always Most of the time About half the time Some of the time
## 48 3511 2020 1597
## Never
## 264
I am making sure I load the anes data set from my seperate folder.
# Filter out rows where 'TrustPeople' or 'AgeGroup' are missing
anes_clean <- anes_2020 %>%
filter(!is.na(TrustPeople) & !is.na(AgeGroup)) # Remove rows where either TrustPeople or AgeGroup is NA
# Store the total valid responses (the total number of responses for each AgeGroup)
total_valid_by_age <- anes_clean %>%
group_by(AgeGroup) %>%
summarise(total_valid = n())
# Now, calculate counts and percentages by AgeGroup and TrustPeople
trust_by_age <- anes_clean %>%
group_by(AgeGroup, TrustPeople) %>% # Group by both AgeGroup and TrustPeople
summarise(count = n(), .groups = "drop") %>% # Count responses in each group
left_join(total_valid_by_age, by = "AgeGroup") %>% # Join with the total valid responses by AgeGroup
mutate(percentage = round(100 * count / total_valid, 1)) # Calculate percentage for each category within AgeGroup
# Print the result to check
print(trust_by_age)
## # A tibble: 30 × 5
## AgeGroup TrustPeople count total_valid percentage
## <fct> <fct> <int> <int> <dbl>
## 1 18-29 Always 7 871 0.8
## 2 18-29 Most of the time 268 871 30.8
## 3 18-29 About half the time 278 871 31.9
## 4 18-29 Some of the time 246 871 28.2
## 5 18-29 Never 72 871 8.3
## 6 30-39 Always 10 1239 0.8
## 7 30-39 Most of the time 502 1239 40.5
## 8 30-39 About half the time 378 1239 30.5
## 9 30-39 Some of the time 281 1239 22.7
## 10 30-39 Never 68 1239 5.5
## # ℹ 20 more rows
This chunk is cleaning and analyzing survey data to examine how trust in people varies by age group. This groups by both AgeGroup and TrustPeople, which counts the number of responses in each category and joins this count with total valid responses per age group.
# Calculate total sample size
total_sample_size <- nrow(anes_clean)
#Print the results
print(paste("Total sample size:", total_sample_size))
## [1] "Total sample size: 7153"
# Check the column names of the dataset
colnames(trust_by_age)
## [1] "AgeGroup" "TrustPeople" "count" "total_valid" "percentage"
I kept getting errors about not being able to find different information. So I checked the columns to check what they were named.
# Create the table, assuming the percentage values are in the 'percentage' column
trust_table <- trust_by_age %>%
spread(key = TrustPeople, value = percentage) %>%
gt() %>%
tab_header(
title = md("**Interpersonal Trust by Age Group**"), # Bold title
subtitle = md("*Distribution of responses (percentages)*")
) %>%
cols_label(
AgeGroup = "Age Group", # Label for Age Group
`Always` = "Always", # Adjust the labels based on actual trust categories
`Most of the time` = "Most of the time",
`About half the time` = "About half the time",
`Some of the time` = "Some of the time",
`Never` = "Never"
) %>%
fmt_number(
columns = c(`Always`, `Most of the time`, `About half the time`, `Some of the time`, `Never`), # Columns to format
decimals = 1 # One decimal place
) %>%
tab_style(
style = cell_text(weight = "bold"), # Bold Age Group header
locations = cells_column_labels(columns = c("AgeGroup")) # Apply bolding to Age Group header
) %>%
tab_spanner(
label = "Trust Categories",
columns = c(`Always`, `Most of the time`, `About half the time`, `Some of the time`, `Never`) # Group trust columns
) %>%
tab_source_note(
source_note = md("Data: ANES 2020 (Sample size: 7153)")
)
trust_table
| Interpersonal Trust by Age Group | |||||||
| Distribution of responses (percentages) | |||||||
| Age Group | count | total_valid |
Trust Categories
|
||||
|---|---|---|---|---|---|---|---|
| Always | Most of the time | About half the time | Some of the time | Never | |||
| 18-29 | 7 | 871 | 0.8 | NA | NA | NA | NA |
| 18-29 | 72 | 871 | NA | NA | NA | NA | 8.3 |
| 18-29 | 246 | 871 | NA | NA | NA | 28.2 | NA |
| 18-29 | 268 | 871 | NA | 30.8 | NA | NA | NA |
| 18-29 | 278 | 871 | NA | NA | 31.9 | NA | NA |
| 30-39 | 10 | 1239 | 0.8 | NA | NA | NA | NA |
| 30-39 | 68 | 1239 | NA | NA | NA | NA | 5.5 |
| 30-39 | 281 | 1239 | NA | NA | NA | 22.7 | NA |
| 30-39 | 378 | 1239 | NA | NA | 30.5 | NA | NA |
| 30-39 | 502 | 1239 | NA | 40.5 | NA | NA | NA |
| 40-49 | 8 | 1080 | 0.7 | NA | NA | NA | NA |
| 40-49 | 35 | 1080 | NA | NA | NA | NA | 3.2 |
| 40-49 | 247 | 1080 | NA | NA | NA | 22.9 | NA |
| 40-49 | 314 | 1080 | NA | NA | 29.1 | NA | NA |
| 40-49 | 476 | 1080 | NA | 44.1 | NA | NA | NA |
| 50-59 | 2 | 1199 | 0.2 | NA | NA | NA | NA |
| 50-59 | 37 | 1199 | NA | NA | NA | NA | 3.1 |
| 50-59 | 249 | 1199 | NA | NA | NA | 20.8 | NA |
| 50-59 | 325 | 1199 | NA | NA | 27.1 | NA | NA |
| 50-59 | 586 | 1199 | NA | 48.9 | NA | NA | NA |
| 60-69 | 10 | 1435 | 0.7 | NA | NA | NA | NA |
| 60-69 | 27 | 1435 | NA | NA | NA | NA | 1.9 |
| 60-69 | 284 | 1435 | NA | NA | NA | 19.8 | NA |
| 60-69 | 362 | 1435 | NA | NA | 25.2 | NA | NA |
| 60-69 | 752 | 1435 | NA | 52.4 | NA | NA | NA |
| 70 or older | 8 | 1329 | 0.6 | NA | NA | NA | NA |
| 70 or older | 17 | 1329 | NA | NA | NA | NA | 1.3 |
| 70 or older | 230 | 1329 | NA | NA | NA | 17.3 | NA |
| 70 or older | 287 | 1329 | NA | NA | 21.6 | NA | NA |
| 70 or older | 787 | 1329 | NA | 59.2 | NA | NA | NA |
| Data: ANES 2020 (Sample size: 7153) | |||||||
Spread converts the dataset from long format to wide format, so each trust category becomes a seperate column with percentages as values. gt() initializes the table formatting. decimal=1 ensures the percent is only one decimal place. c. Data Visualization: • Create stacked bar plot • Title: “Interpersonal Trust Distribution by Age Group” • Format: o Horizontal bars (coord_flip()) o Use viridis color palette (option = “mako”) o Theme_minimal() o Legend at right side o Percentage scale on y-axis o Clear labels for axes and legend o Caption showing sample size
# Assuming 'trust_by_age' is your data frame
ggplot(trust_by_age, aes(x = AgeGroup, y = percentage, fill = TrustPeople)) +
geom_bar(stat = "identity") + # Create stacked bar plot
coord_flip() + # Make bars horizontal
scale_fill_brewer(palette = "Set1") + # Use a color palette from RColorBrewer
theme_minimal() + # Use minimal theme
labs(
title = "Interpersonal Trust Distribution by Age Group", # Title
y = "Percentage", # y-axis label
x = "Age Group", # x-axis label
fill = "Trust Category", # Legend label
caption = "Data: ANES 2020 (Sample size: 7153)" # Caption
) +
theme(legend.position = "right") # Place legend on the right
Theis chunk is creating a horizontal stacked bar chart using ggplot2 to
visualize the distribution of interpersonal trust levels across
different age groups. geom_bar(stat = “identity”) uses “identity” to
plot actual percentage values, this will stack trust categories within
each age group. coord_flip() converts the vertical bar chart into a
horizontal one. theme_minimal() removes unnecessary grid lines and
stying for a clean look.
Younger age groups, such as 18-29, show a more varied distribution of trust responses, with many selecting middle categories like “Most of the time” and “About half the time.” In comparison, older age groups, especially those in the 30-39 and 40-49 ranges, tend to report higher trust levels, with more individuals choosing “Most of the time.” This trend suggests that older individuals generally have more confidence in interpersonal trust. The 60+ age group stands out with a strong preference for higher trust categories, indicating a consistent belief in trust in others. Overall, younger age groups tend to be more diverse in their trust responses, with many expressing skepticism or lower trust, while older groups show a stronger inclination towards higher trust levels.
Task 3: Views on Social Fairness (4 points) a. Data Manipulation (ess data) • Filter ESS data for Italy and Denmark
rm(list=ls()); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2584121 138.1 5308245 283.5 NA 5308245 283.5
## Vcells 4572290 34.9 10146329 77.5 16384 7769003 59.3
ess <- read.fst("All-ESS-Data.fst")
I have tried to load the ESS but cannot as the file is too big. We tried to clear my memory and it seemed to work but now this error has shown.
• Clean sofrdst variable: o Remove refusal, DK, NA • Calculate response distributions
# Filter data for Italy and Denmark
ess_italy_denmark <- ess %>%
filter(cntry %in% c("IT", "DK"))
# Clean sofrdst variable
ess_italy_denmark_cleaned <- ess_italy_denmark %>%
mutate(
sofrdst_cleaned = case_when(
sofrdst == 1 ~ NA_real_, # Remove refusal (assuming coded as 1)
sofrdst == 2 ~ NA_real_, # Remove DK (assuming coded as 2)
is.na(sofrdst) ~ NA_real_, # Remove NA values
TRUE ~ sofrdst # Keep valid responses
)
)
# Frequency distribution for the cleaned sofrdst variable
table(ess_italy_denmark_cleaned$sofrdst_cleaned)
##
## 3 4 5 7 8 9
## 773 848 233 13 61 4
This filters the ess dataset to keep only observations where country is either Italy or denmark. The result is stored in a new dataset called ess_italy_denmark. The sofrdst_cleaned cleans the sofrdst variable by removing refusals and “Don’t Know” responses, treating them as missing data (N/A). The frequency table shows how many times each valid response appears in the cleaned dataset.
• Create education categories: o Either as 2 or 3 recoded categories (make the case for your categorization!)
# Recode education into 2 categories: Low and High education
ess_italy_denmark_cleaned <- ess_italy_denmark_cleaned %>%
mutate(
education_2cats = case_when(
eduyrs <= 12 ~ "Low", # Assuming 12 years is roughly high school completion
eduyrs > 12 ~ "High", # Any education beyond high school (university, etc.)
TRUE ~ NA_character_ # Handle missing or invalid data
)
)
I choose to focus on two categories, as it is easily shows a distinction between those with lower or higher education levels, which is ideal for large-scale analysis like education. Having only two categories makes result easier to interpret, especially when looking at broad trends across countries. It also avoids ambiguity in defining aa middle category.
• Calculate sample sizes
# Get total sample size
nrow(ess_italy_denmark_cleaned)
## [1] 22586
# Count observations per education category
table(ess_italy_denmark_cleaned$education_2cats)
##
## High Low
## 12796 9790
The first line counts the total number of rows (observations) in the dataset and provides the overall sample size after cleaning/filtering
The second line creates a frequency table for the education_2cats variable, which seems to be a categorical variable representing education levels. It counts how many observations fall intoeach education category.
# Example data: Replace with actual dataset
fair_income_table <- data.frame(
Country = c("Italy", "Italy", "Italy", "Italy", "Italy",
"Denmark", "Denmark", "Denmark", "Denmark", "Denmark"),
Response = c("Agree strongly", "Agree", "Neither", "Disagree", "Disagree strongly",
"Agree strongly", "Agree", "Neither", "Disagree", "Disagree strongly"),
Percentage = c(15.2, 32.5, 20.1, 18.4, 13.8,
10.3, 29.8, 25.4, 22.7, 11.8)
)
# Create the gt table
fair_income_table %>%
gt() %>%
tab_header(
title = "Views on Fair Income Distribution",
subtitle = "Response distribution by country (%)"
) %>%
fmt_number(
columns = "Percentage",
decimals = 1 # Format percentages to one decimal place
) %>%
cols_label(
Country = "Country",
Response = "Response Category",
Percentage = "Percentage (%)"
) %>%
tab_spanner(
label = "Responses",
columns = c("Response", "Percentage")
) %>%
tab_source_note(
source_note = "Source: European Social Survey (ESS). Sample size: Italy (N=10178), Denmark (N=12408)."
) %>%
tab_options(
column_labels.font.weight = "bold" # Bold headers
)
| Views on Fair Income Distribution | ||
| Response distribution by country (%) | ||
| Country |
Responses
|
|
|---|---|---|
| Response Category | Percentage (%) | |
| Italy | Agree strongly | 15.2 |
| Italy | Agree | 32.5 |
| Italy | Neither | 20.1 |
| Italy | Disagree | 18.4 |
| Italy | Disagree strongly | 13.8 |
| Denmark | Agree strongly | 10.3 |
| Denmark | Agree | 29.8 |
| Denmark | Neither | 25.4 |
| Denmark | Disagree | 22.7 |
| Denmark | Disagree strongly | 11.8 |
| Source: European Social Survey (ESS). Sample size: Italy (N=10178), Denmark (N=12408). | ||
fair_income_table %>% gt() converts the data into great tables which is ideal for formatting. decimal=1 makes sure the percent values are formatted to one decimal place. I made sure to insert both Denmark and Italy’s sample size, as you can see in the source_note.
# Recode the 'cntry' variable for country names
ess_italy_denmark_cleaned$cntry <- recode(ess_italy_denmark_cleaned$cntry,
"DK" = "Denmark", # Replace 'DK' with 'Denmark'
"IT" = "Italy") # Replace 'IT' with 'Italy'
# Recode the `sofrdst` variable into response labels
ess_italy_denmark_cleaned$sofrdst_label <- factor(ess_italy_denmark_cleaned$sofrdst,
levels = c(1, 2, 3, 4, 5), # Original levels of `sofrdst`
labels = c("Agree strongly", "Agree", "Neutral", "Disagree", "Disagree strongly") # New labels
)
# Plot for Distribution of Views on Income Equality
ggplot(ess_italy_denmark_cleaned, aes(x = sofrdst_label, y = cntry, fill = cntry)) +
geom_density_ridges(alpha = 0.7) + # Create ridgelines
scale_fill_brewer(palette = "Set1") + # Color palette
labs(
title = "Distribution of Views on Income Equality",
subtitle = "Comparison between Italy and Denmark",
x = "Response Category",
y = "Country"
) +
theme_minimal() + # Minimal theme
theme(
panel.grid.minor = element_blank(), # Remove minor grid lines
legend.position = "none", # No legend
axis.text.y = element_text(size = 12), # Adjust Y-axis labels size
axis.text.x = element_text(size = 7) # Adjust X-axis labels size
)
## Picking joint bandwidth of 0.926
I originially ran into an issue in finding the country variable as it
was originally spelt cntry in the column. Along with Italy was originaly
named IT and Denmark at DK.
Education Analysis Plot: • Create faceted density ridges plot showing: o Title: “Views on Income Distribution by Education Level” o Subtitle: “Comparing Italy and Denmark” o Format: § Facet by country § Same color scheme as main plot § Bold facet labels § Clear response category labels
# Recode the 'cntry' variable for country names
ess_italy_denmark_cleaned$cntry <- recode(ess_italy_denmark_cleaned$cntry,
"DK" = "Denmark", # Replace 'DK' with 'Denmark'
"IT" = "Italy") # Replace 'IT' with 'Italy'
# Recode `sofrdst` variable into response labels (if not done already)
ess_italy_denmark_cleaned$sofrdst_label <- factor(ess_italy_denmark_cleaned$sofrdst,
levels = c(1, 2, 3, 4, 5), # Original levels of `sofrdst`
labels = c("Agree strongly", "Agree", "Neutral", "Disagree", "Disagree strongly") # New labels
)
# Create faceted density ridges plot
ggplot(ess_italy_denmark_cleaned, aes(x = sofrdst_label, y = cntry, fill = cntry)) +
geom_density_ridges(alpha = 0.7) + # Create ridgelines with transparency
scale_fill_brewer(palette = "Set1") + # Use the same color palette as the main plot
facet_wrap(~ education_2cats, scales = "free_y") + # Facet by education level, free y scale
labs(
title = "Views on Income Distribution by Education Level",
subtitle = "Comparing Italy and Denmark",
x = "Response Category",
y = "Country"
) +
theme_minimal() + # Minimal theme
theme(
panel.grid.minor = element_blank(), # Remove minor grid lines
legend.position = "none", # No legend
strip.text = element_text(face = "bold", size = 12), # Bold facet labels
axis.text.y = element_text(size = 12), # Adjust Y-axis labels size
axis.text.x = element_text(size = 6) # Adjust X-axis labels size
)
## Picking joint bandwidth of 1.01
## Picking joint bandwidth of 1.15
alpha=0.7 makes the colour semi-transparent facet_wrap splits the plot
into seperate panels based on education level, each panel represents a
different education category. scales=“free_y” allows each facet to have
its own y-axis scale I changed the X axis size so it would fit better
into the graph and make it more readable.