# Required packages
packages <- c(
"tidyverse",
"gt",
"gapminder",
"srvyr",
"srvyrexploR",
"fst",
"ggridges"
)
# For data manipulation and ggplot2
# For formatted tables
# For gapminder dataset
# For survey data
# For ANES 2020 dataset
# For reading ESS data
# For density ridge plots
# Install and load packages
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'srvyr'
##
##
## The following object is masked from 'package:stats':
##
## filter
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "gt" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "gapminder" "gt" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "srvyr" "gapminder" "gt" "lubridate" "forcats" "stringr"
## [7] "dplyr" "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [13] "tidyverse" "stats" "graphics" "grDevices" "utils" "datasets"
## [19] "methods" "base"
##
## [[5]]
## [1] "srvyrexploR" "srvyr" "gapminder" "gt" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[6]]
## [1] "fst" "srvyrexploR" "srvyr" "gapminder" "gt"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[7]]
## [1] "ggridges" "fst" "srvyrexploR" "srvyr" "gapminder"
## [6] "gt" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
Task 1. Global Life Expectancy Changes (3 points) a. Data Manipulation (gapminder data) • Filter for years 1987 and 2007 • Calculate mean life expectancy by continent for each year • Calculate the change between years • In a separate step, filter to five focal countries (Niger, Bangladesh, El Salvador, Iraq, Zimbabwe) Note: My Columns Are: country, continent, year, lifeExp, pop, gdpPercap
life_exp_summary <- gapminder %>%
filter(year %in% c(1987, 2007)) %>%
group_by(continent, country) %>%
summarize(
start_life = first(lifeExp),
end_life = last(lifeExp),
change = end_life - start_life,
avg_life = mean(lifeExp),
.groups = "drop"
) %>%
arrange(avg_life)
life_exp_summary
## # A tibble: 142 × 6
## continent country start_life end_life change avg_life
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Africa Sierra Leone 40.0 42.6 2.56 41.3
## 2 Africa Angola 39.9 42.7 2.83 41.3
## 3 Asia Afghanistan 40.8 43.8 3.01 42.3
## 4 Africa Mozambique 42.9 42.1 -0.779 42.5
## 5 Africa Guinea-Bissau 41.2 46.4 5.14 43.8
## 6 Africa Rwanda 44.0 46.2 2.22 45.1
## 7 Africa Liberia 46.0 45.7 -0.349 45.9
## 8 Africa Somalia 44.5 48.2 3.66 46.3
## 9 Africa Zambia 50.8 42.4 -8.44 46.6
## 10 Africa Nigeria 46.9 46.9 -0.0270 46.9
## # ℹ 132 more rows
key_cases <- life_exp_summary %>%
filter(country %in% c("Niger", "Bangladesh", "El Salvador", "Iraq",
"Zimbabwe"))
key_cases
## # A tibble: 5 × 6
## continent country start_life end_life change avg_life
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Africa Niger 44.6 56.9 12.3 50.7
## 2 Africa Zimbabwe 62.4 43.5 -18.9 52.9
## 3 Asia Bangladesh 52.8 64.1 11.2 58.4
## 4 Asia Iraq 65.0 59.5 -5.50 62.3
## 5 Americas El Salvador 63.2 71.9 8.72 67.5
• Format: o All values to one decimal place o Bold headers (Continent, 1987, 2007, Change) o Order continents by magnitude of change (largest to smallest) • Source note: Data: Gapminder
gapminder_data <- data.frame(
Continent = c("Africa", "Asia", "Europe", "Americas", "Oceania"),
LifeExp_1987 = c(50.2, 65.3, 72.4, 70.1, 74.8),
LifeExp_2007 = c(56.1, 71.4, 76.3, 74.5, 78.2)
)
life_exp_table <- gapminder_data %>%
mutate(Change = LifeExp_2007 - LifeExp_1987) %>%
arrange(desc(abs(Change)))
gt_table <- life_exp_table %>%
gt() %>%
tab_header(
title = "Life Expectancy Changes by Continent",
subtitle = "Average life expectancy in years"
) %>%
fmt_number(
columns = c(LifeExp_1987, LifeExp_2007, Change),
decimals = 1
) %>%
cols_label(
Continent = "Continent",
LifeExp_1987 = "1987",
LifeExp_2007 = "2007",
Change = "Change (2007-1987)"
) %>%
tab_options(
column_labels.font.weight = "bold"
)
gt_table
| Life Expectancy Changes by Continent | |||
| Average life expectancy in years | |||
| Continent | 1987 | 2007 | Change (2007-1987) |
|---|---|---|---|
| Asia | 65.3 | 71.4 | 6.1 |
| Africa | 50.2 | 56.1 | 5.9 |
| Americas | 70.1 | 74.5 | 4.4 |
| Europe | 72.4 | 76.3 | 3.9 |
| Oceania | 74.8 | 78.2 | 3.4 |
selected_countries <- c("India", "China", "United States", "Brazil", "South Africa")
gapminder_clean <- gapminder %>%
mutate(
country_clean = case_when(
country %in% selected_countries ~ country,
TRUE ~ NA_character_
),
year_clean = case_when(
year %in% c(1987, 2007) ~ year,
TRUE ~ NA_real_
)
) %>%
filter(!is.na(country_clean) & !is.na(year_clean)) %>%
select(country, continent, year, lifeExp)
ggplot(gapminder_clean, aes(x = year, y = lifeExp, color = country)) +
geom_line(linewidth = 1.5) +
scale_color_brewer(palette = "Set1") +
theme_minimal() +
theme(
panel.grid.minor = element_blank(),
legend.position = "bottom",
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 12)
) +
labs(
title = "Life Expectancy Trajectories (1987-2007)",
subtitle = "In Selected Countries",
x = "Year",
y = "Life Expectancy (years)",
color = "Country"
)
d. Interpretation: 1. Continental trends: o Compare changes across
regions o Discuss initial and final values for life expectancy (i.e.,
1987 and 2007) 2. Five-country analysis: o Compare magnitude of changes
o Contrast different trajectories (highlight which had the highest
increase, which had the lowest increase)
Interpretation Answer:
Continental Trends: Comparison of Changes Across Regions: Asia had the largest increase in life expectancy from 1987 to 2007 (+6.1 years). The next continent was Africa with a +5.9 year increase. The Americas (+4.4), Europe (+3.9), and Oceania (+3.4)had smaller increases compared to Asia and Africa. All continents saw an increase in life expectancy, but the rate of improvement varied. Initial and Final Values for Life Expectancy: In 1987, Oceania (74.8 years) and Europe (72.4 years) had the highest life expectancy, followed by the Americas (70.1 years). Asia (65.3 years) and Africa (50.2 years) had lower life expectancies, with Africa being the lowest. By 2007, the rankings stayed the same, even with every continent improving. Oceania (78.2 years) and Europe (76.3 years) still had the highest life expectancy, while Africa (56.1 years) remained the lowest. The gap between continents narrowed due to Asia and Africa significantly improving.
Five-Country Analysis: Magnitude of Changes: China, Brazil, India, and the U.S. all had an increase in life expectancy. South Africa was the only country that saw a decrease in life expectancy. Contrasting Trajectories: The countries with the highest increase were China, India, and Brazil. The country with the lowest increase was The U.S. The country with a decrease was South Africa.
Task 2: Interpersonal Trust Patterns (3 points)
table(anes_2020$TrustPeople)
##
## Always Most of the time About half the time Some of the time
## 48 3511 2020 1597
## Never
## 264
table(anes_2020$AgeGroup)
##
## 18-29 30-39 40-49 50-59 60-69 70 or older
## 871 1241 1081 1200 1436 1330
# First, let's store the total valid responses
total_valid <- anes_2020 %>%
filter(!is.na(TrustPeople),!is.na(AgeGroup)) %>%
nrow()
total_valid
## [1] 7153
# Now calculate proportions using this total
trust_props <- anes_2020 %>%
filter(!is.na(TrustPeople),!is.na(AgeGroup)) %>% # Remove missing values first
group_by(AgeGroup, TrustPeople ) %>% # Group by trust response
summarize( # Calculate counts and percentages
count = n(),
percentage = round(100 * count / total_valid, 1) # Use total_valid instead of sum(n())
) %>%
arrange(desc(count)) # Sort by frequency
## `summarise()` has grouped output by 'AgeGroup'. You can override using the
## `.groups` argument.
trust_props
## # A tibble: 30 × 4
## # Groups: AgeGroup [6]
## AgeGroup TrustPeople count percentage
## <fct> <fct> <int> <dbl>
## 1 70 or older Most of the time 787 11
## 2 60-69 Most of the time 752 10.5
## 3 50-59 Most of the time 586 8.2
## 4 30-39 Most of the time 502 7
## 5 40-49 Most of the time 476 6.7
## 6 30-39 About half the time 378 5.3
## 7 60-69 About half the time 362 5.1
## 8 50-59 About half the time 325 4.5
## 9 40-49 About half the time 314 4.4
## 10 70 or older About half the time 287 4
## # ℹ 20 more rows
trust_dist <- anes_2020 %>%
# Calculate total responses including missing before filtering
mutate(total_responses = n()) %>%
# Remove missing values for the distribution analysis
filter(!is.na(TrustPeople),!is.na(AgeGroup)) %>%
# Set proper ordering of trust levels
mutate(trust_level = factor(TrustPeople,
levels = c("Never",
"Some of the time",
"About half the time",
"Most of the time",
"Always"))) %>%
# Group by trust level and get counts
group_by(AgeGroup, TrustPeople) %>%
summarize(
count = n(),
.groups = 'drop'
) %>%
group_by(AgeGroup) %>%
# Calculate total and percentages
mutate(
n_total = sum(count),
percentage = round(count / n_total * 100, 1)
)
trust_dist
## # A tibble: 30 × 5
## # Groups: AgeGroup [6]
## AgeGroup TrustPeople count n_total percentage
## <fct> <fct> <int> <int> <dbl>
## 1 18-29 Always 7 871 0.8
## 2 18-29 Most of the time 268 871 30.8
## 3 18-29 About half the time 278 871 31.9
## 4 18-29 Some of the time 246 871 28.2
## 5 18-29 Never 72 871 8.3
## 6 30-39 Always 10 1239 0.8
## 7 30-39 Most of the time 502 1239 40.5
## 8 30-39 About half the time 378 1239 30.5
## 9 30-39 Some of the time 281 1239 22.7
## 10 30-39 Never 68 1239 5.5
## # ℹ 20 more rows
Table Creation: • Title: “Interpersonal Trust by Age Group” • Subtitle: “Distribution of responses (percentages)” • Format: o All percentages to one decimal place o Bold “Age Group” header o All trust categories as column headers • Source note: “Data: ANES 2020 (sample size value)”
Data Visualization: • Create stacked bar plot • Title: “Interpersonal Trust Distribution by Age Group” • Format: o Horizontal bars (coord_flip()) o Use viridis color palette (option = “mako”) o Theme_minimal() o Legend at right side o Percentage scale on y-axis o Clear labels for axes and legend o Caption showing sample size
ggplot(
data = anes_2020 %>%
filter(!is.na(TrustPeople),!is.na(AgeGroup)),
mapping = aes(x = AgeGroup, fill = TrustPeople)
) +
geom_bar(
position = "fill",
color = "white",
alpha = 0.9
) +
coord_flip() +
scale_fill_viridis_d(
option = "viridis",
direction = -1
) +
scale_y_continuous(
labels = scales::percent,
breaks = seq(0, 1, 0.2)
) +
labs(
title = "Interpersonal Trust Distribution by Age Group",
x = NULL,
y = "Percentage Scale",
fill = "Level of Trust"
) +
theme_minimal() +
theme(
legend.position = "right",
legend.title = element_text(face = "bold"),
plot.title = element_text(face = "bold", size = 14),
axis.text = element_text(size = 10)
)
d. Interpretation Write a paragraph analyzing: • Age patterns in trust
levels • Distribution of response categories • Key differences between
age groups Analyzing trust levels across different age groups reveals
that older individuals tend to exhibit higher levels of trust compared
to younger age groups. The proportion of respondents who trust others
“most of the time” can be seen increasing with age, with those aged 70
or older showing the highest levels of trust. Younger respondents show
significantly lower trust, with a higher proportion choosing “some of
the time” or “never” with the 18-29 group showing the lowest (“never”)
level of trust. The middle-aged group (30-49) exhibits a wider range of
trust responses, showing greater variability in trust attitudes This
stacked bar plot suggests that trust in others likely increases with
age.
Task 3: Views on Social Fairness (4 points)
library(fst)
italy_data <- read_fst("italy_data.fst")
library(fst)
denmark_data <- read_fst("denmark_data.fst")
italy_data <- italy_data %>%
mutate(sofrdst = as.character(sofrdst),
sofrdst = case_when(
sofrdst %in% c("refusal", "DK") ~ NA_character_,
is.na(sofrdst) ~ NA_character_,
TRUE ~ sofrdst
))
denmark_data <- denmark_data %>%
mutate(sofrdst = as.character(sofrdst),
sofrdst = case_when(
sofrdst %in% c("refusal", "DK") ~ NA_character_,
is.na(sofrdst) ~ NA_character_,
TRUE ~ sofrdst
))
table(denmark_data$sofrdst)
##
## 1 2 3 4 5 7 8 9
## 79 268 325 674 202 4 16 4
table(italy_data$sofrdst)
##
## 1 2 3 4 5 7 8
## 692 1346 448 174 31 9 45
combined_data <- bind_rows(
denmark_data %>% mutate(country = "Denmark"),
italy_data %>% mutate(country = "Italy")
)
eisced_summary <- combined_data %>%
mutate(
eisced_label = case_when(
eisced == 0 ~ "Cannot harmonize",
eisced == 1 ~ "Less than lower secondary",
eisced == 2 ~ "Lower secondary",
eisced == 3 ~ "Lower tier upper secondary",
eisced == 4 ~ "Upper tier upper secondary",
eisced == 5 ~ "Advanced vocational",
eisced == 6 ~ "Bachelor's level",
eisced == 7 ~ "Master's level or higher",
TRUE ~ "Missing"
)
) %>%
count(country, eisced, eisced_label) %>%
mutate(
pct = n / sum(n) * 100,
valid_pct = ifelse(eisced %in% 0:7, n / sum(n[eisced %in% 0:7]) * 100, NA)
)
print(eisced_summary)
## country eisced eisced_label n pct valid_pct
## 1 Denmark 1 Less than lower secondary 610 2.700788099 2.723214
## 2 Denmark 2 Lower secondary 2353 10.417958027 10.504464
## 3 Denmark 3 Lower tier upper secondary 3468 15.354644470 15.482143
## 4 Denmark 4 Upper tier upper secondary 967 4.281413265 4.316964
## 5 Denmark 5 Advanced vocational 1282 5.676082529 5.723214
## 6 Denmark 6 Bachelor's level 2410 10.670326751 10.758929
## 7 Denmark 7 Master's level or higher 1239 5.485699106 5.531250
## 8 Denmark 55 Missing 14 0.061985301 NA
## 9 Denmark 77 Missing 2 0.008855043 NA
## 10 Denmark 88 Missing 4 0.017710086 NA
## 11 Denmark 99 Missing 59 0.261223767 NA
## 12 Italy 0 Cannot harmonize 1207 5.344018418 5.388393
## 13 Italy 1 Less than lower secondary 1090 4.825998406 4.866071
## 14 Italy 2 Lower secondary 2685 11.887895156 11.986607
## 15 Italy 3 Lower tier upper secondary 626 2.771628442 2.794643
## 16 Italy 4 Upper tier upper secondary 2874 12.724696715 12.830357
## 17 Italy 5 Advanced vocational 295 1.306118835 1.316964
## 18 Italy 6 Bachelor's level 403 1.784291154 1.799107
## 19 Italy 7 Master's level or higher 891 3.944921633 3.977679
## 20 Italy 55 Missing 25 0.110688037 NA
## 21 Italy 77 Missing 68 0.301071460 NA
## 22 Italy 88 Missing 9 0.039847693 NA
## 23 Italy 99 Missing 5 0.022137607 NA
combined_data <- bind_rows(
denmark_data %>% mutate(country = "Denmark"),
italy_data %>% mutate(country = "Italy")
)
education_summary <- combined_data %>%
mutate(
education = case_when(
# Secondary or less (ES-ISCED levels 1-4)
eisced %in% 1:4 ~ "Up to Secondary",
# Bachelor's level (ES-ISCED level 6)
eisced == 6 ~ "Bachelor's",
# Master's or higher (ES-ISCED levels 7 and 8)
eisced %in% 7:8 ~ "Master's",
TRUE ~ NA_character_
),
education = factor(education, levels = c("Up to Secondary", "Bachelor's", "Master's"))
) %>%
count(country, education) %>%
mutate(
pct = n / sum(n) * 100,
valid_pct = n / sum(!is.na(education)) * 100
)
print(education_summary)
## country education n pct valid_pct
## 1 Denmark Up to Secondary 7398 32.754804 123300.000
## 2 Denmark Bachelor's 2410 10.670327 40166.667
## 3 Denmark Master's 1239 5.485699 20650.000
## 4 Denmark <NA> 1361 6.025857 22683.333
## 5 Italy Up to Secondary 7275 32.210219 121250.000
## 6 Italy Bachelor's 403 1.784291 6716.667
## 7 Italy Master's 891 3.944922 14850.000
## 8 Italy <NA> 1609 7.123882 26816.667
JUSTIFICATION: I used three categories because I feel that it helps to distinguish the levels of education and their varying impacts on income and wealth (I.E. Between Bachelors and Masters education level)
combined_data <- bind_rows(
denmark_data %>% mutate(country = "Denmark"),
italy_data %>% mutate(country = "Italy")
)
combined_data <- combined_data %>%
mutate(
education = case_when(
eisced %in% 1:4 ~ "Up to Secondary",
eisced == 6 ~ "Bachelor's",
eisced %in% 7:8 ~ "Master's",
TRUE ~ NA_character_
),
education = factor(education, levels = c("Up to Secondary", "Bachelor's", "Master's"))
)
sample_size_summary <- combined_data %>%
group_by(country, education) %>%
summarise(
n = n(),
.groups = "drop"
)
print(sample_size_summary)
## # A tibble: 8 × 3
## country education n
## <chr> <fct> <int>
## 1 Denmark Up to Secondary 7398
## 2 Denmark Bachelor's 2410
## 3 Denmark Master's 1239
## 4 Denmark <NA> 1361
## 5 Italy Up to Secondary 7275
## 6 Italy Bachelor's 403
## 7 Italy Master's 891
## 8 Italy <NA> 1609
response_distribution <- combined_data %>%
filter(!is.na(sofrdst)) %>% # Remove NAs from the 'sofrdst' variable if necessary
group_by(country, sofrdst) %>%
summarise(
n = n(), # Sample size for each category
pct = n / sum(n) * 100, # Calculate percentage
.groups = "drop"
) %>%
mutate(
pct = round(pct, 1) # Round percentages to one decimal place
)
response_table <- response_distribution %>%
mutate(
country = recode(country,
"Denmark" = "Denmark",
"Italy" = "Italy"),
sofrdst = factor(sofrdst,
levels = c("Agree strongly", "Agree", "Neither agree nor disagree",
"Disagree", "Disagree strongly"))
) %>%
gt() %>%
tab_header(
title = "Views on Fair Income Distribution",
subtitle = "Response distribution by country (%)"
) %>%
cols_label(
country = "Country",
sofrdst = "Response Categories",
pct = "Percentage (%)"
) %>%
tab_spanner(
label = "Response Distribution",
columns = c("pct")
) %>%
tab_style(
style = list(cell_text(weight = "bold")),
locations = cells_column_labels(columns = vars(country))
) %>%
tab_source_note(
source_note = paste("Sample sizes: Denmark (", sum(response_distribution$n[response_distribution$country == 'Denmark']),
"), Italy (", sum(response_distribution$n[response_distribution$country == 'Italy']), ")", sep = "")
)
## Warning: Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
response_table
| Views on Fair Income Distribution | |||
| Response distribution by country (%) | |||
| Country | Response Categories | n |
Response Distribution
|
|---|---|---|---|
| Percentage (%) | |||
| Denmark | NA | 79 | 100 |
| Denmark | NA | 268 | 100 |
| Denmark | NA | 325 | 100 |
| Denmark | NA | 674 | 100 |
| Denmark | NA | 202 | 100 |
| Denmark | NA | 4 | 100 |
| Denmark | NA | 16 | 100 |
| Denmark | NA | 4 | 100 |
| Italy | NA | 692 | 100 |
| Italy | NA | 1346 | 100 |
| Italy | NA | 448 | 100 |
| Italy | NA | 174 | 100 |
| Italy | NA | 31 | 100 |
| Italy | NA | 9 | 100 |
| Italy | NA | 45 | 100 |
| Sample sizes: Denmark (1572), Italy (2745) | |||
ggplot(
data = combined_data,
mapping = aes(
x = as.numeric(sofrdst),
y = country,
fill = country
)
) +
geom_density_ridges(
alpha = 0.7,
scale = 0.9
) +
scale_fill_manual(values = c("Denmark" = "#4B9CD3", "Italy" = "#F4B942")) +
scale_x_continuous(
breaks = 1:5,
labels = c("Agree strongly", "Agree", "Neither agree nor disagree",
"Disagree", "Disagree strongly")
) +
labs(
title = "Distribution of Views on Income Equality",
subtitle = "Comparison between Italy and Denmark",
x = "Response Categories",
y = NULL
) +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1),
plot.margin = margin(10, 10, 50, 10)
)
## Picking joint bandwidth of 0.187
## Warning: Removed 18269 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
combined_data <- combined_data %>%
mutate(
education = case_when(
eisced %in% 1:4 ~ "Up to Secondary",
eisced == 6 ~ "Bachelor's",
eisced %in% 7:8 ~ "Master's",
TRUE ~ NA_character_
),
education = factor(education, levels = c("Up to Secondary", "Bachelor's", "Master's"))
)
table(combined_data$education, useNA = "always")
##
## Up to Secondary Bachelor's Master's <NA>
## 14673 2813 2130 2970
ggplot(
data = combined_data %>% filter(!is.na(education)),
mapping = aes(
x = as.numeric(sofrdst),
y = education,
fill = education
)
) +
geom_density_ridges(
alpha = 0.7,
scale = 0.9
) +
scale_fill_manual(values = c(
"Up to Secondary" = "#4B9CD3",
"Bachelor's" = "#F4B942",
"Master's" = "#D9534F"
)) +
scale_x_continuous(
breaks = 1:5,
labels = c("Agree strongly", "Agree", "Neither agree nor disagree",
"Disagree", "Disagree strongly")
) +
facet_wrap(~country) +
labs(
title = "Views on Income Distribution by Education Level",
subtitle = "Comparing Italy and Denmark",
x = "Response Categories",
y = NULL
) +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(face = "bold")
)
## Picking joint bandwidth of 0.263
## Picking joint bandwidth of 0.207
## Warning: Removed 15583 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
This plot shows a notable difference in the attitudes on income
distribution between Denmark and Italy. Italy shows a much stronger
concentration of agreement in the “Agree” and “Agree Strongly”
categories compared to Denmark which shows more even distribution if not
a higher concentration of disagreement. The education attainment of both
countries shows that respondents with a Masters Degree display a wider
range of opinions while those with lower education levels show more
polarization. Overall, the plot shows that higher education levels have
more variability in opinions on income distribution.