MacMillan_Peter_Exercise-1.knit

# Required packages
packages <- c(
  "tidyverse",
  "gt",
  "gapminder",
  "srvyr",
  "srvyrexploR",
  "fst",
  "ggridges"
)
# For data manipulation and ggplot2
# For formatted tables
# For gapminder dataset
# For survey data
# For ANES 2020 dataset
# For reading ESS data
# For density ridge plots
# Install and load packages
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])] 
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'srvyr'
## 
## 
## The following object is masked from 'package:stats':
## 
##     filter

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "gt"        "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "gapminder" "gt"        "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "srvyr"     "gapminder" "gt"        "lubridate" "forcats"   "stringr"  
##  [7] "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"  
## [13] "tidyverse" "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [19] "methods"   "base"     
## 
## [[5]]
##  [1] "srvyrexploR" "srvyr"       "gapminder"   "gt"          "lubridate"  
##  [6] "forcats"     "stringr"     "dplyr"       "purrr"       "readr"      
## [11] "tidyr"       "tibble"      "ggplot2"     "tidyverse"   "stats"      
## [16] "graphics"    "grDevices"   "utils"       "datasets"    "methods"    
## [21] "base"       
## 
## [[6]]
##  [1] "fst"         "srvyrexploR" "srvyr"       "gapminder"   "gt"         
##  [6] "lubridate"   "forcats"     "stringr"     "dplyr"       "purrr"      
## [11] "readr"       "tidyr"       "tibble"      "ggplot2"     "tidyverse"  
## [16] "stats"       "graphics"    "grDevices"   "utils"       "datasets"   
## [21] "methods"     "base"       
## 
## [[7]]
##  [1] "ggridges"    "fst"         "srvyrexploR" "srvyr"       "gapminder"  
##  [6] "gt"          "lubridate"   "forcats"     "stringr"     "dplyr"      
## [11] "purrr"       "readr"       "tidyr"       "tibble"      "ggplot2"    
## [16] "tidyverse"   "stats"       "graphics"    "grDevices"   "utils"      
## [21] "datasets"    "methods"     "base"

Task 1. Global Life Expectancy Changes (3 points) a. Data Manipulation (gapminder data) • Filter for years 1987 and 2007 • Calculate mean life expectancy by continent for each year • Calculate the change between years • In a separate step, filter to five focal countries (Niger, Bangladesh, El Salvador, Iraq, Zimbabwe) Note: My Columns Are: country, continent, year, lifeExp, pop, gdpPercap

life_exp_summary <- gapminder %>%
  filter(year %in% c(1987, 2007)) %>%
  group_by(continent, country) %>%
  summarize(
    start_life = first(lifeExp),
    end_life = last(lifeExp),
    change = end_life - start_life,
    avg_life = mean(lifeExp),
    .groups = "drop"
  ) %>%
  arrange(avg_life)
life_exp_summary

## # A tibble: 142 × 6
##    continent country       start_life end_life  change avg_life
##    <fct>     <fct>              <dbl>    <dbl>   <dbl>    <dbl>
##  1 Africa    Sierra Leone        40.0     42.6  2.56       41.3
##  2 Africa    Angola              39.9     42.7  2.83       41.3
##  3 Asia      Afghanistan         40.8     43.8  3.01       42.3
##  4 Africa    Mozambique          42.9     42.1 -0.779      42.5
##  5 Africa    Guinea-Bissau       41.2     46.4  5.14       43.8
##  6 Africa    Rwanda              44.0     46.2  2.22       45.1
##  7 Africa    Liberia             46.0     45.7 -0.349      45.9
##  8 Africa    Somalia             44.5     48.2  3.66       46.3
##  9 Africa    Zambia              50.8     42.4 -8.44       46.6
## 10 Africa    Nigeria             46.9     46.9 -0.0270     46.9
## # ℹ 132 more rows

key_cases <- life_exp_summary %>%
  filter(country %in% c("Niger", "Bangladesh", "El Salvador", "Iraq",
"Zimbabwe"))

key_cases

## # A tibble: 5 × 6
##   continent country     start_life end_life change avg_life
##   <fct>     <fct>            <dbl>    <dbl>  <dbl>    <dbl>
## 1 Africa    Niger             44.6     56.9  12.3      50.7
## 2 Africa    Zimbabwe          62.4     43.5 -18.9      52.9
## 3 Asia      Bangladesh        52.8     64.1  11.2      58.4
## 4 Asia      Iraq              65.0     59.5  -5.50     62.3
## 5 Americas  El Salvador       63.2     71.9   8.72     67.5

Table Creation: • Title: Life Expectancy Changes by Continent • Subtitle: Average life expectancy in years • Columns: o 1987 values o 2007 values o Change (2007-1987)

• Format: o All values to one decimal place o Bold headers (Continent, 1987, 2007, Change) o Order continents by magnitude of change (largest to smallest) • Source note: Data: Gapminder

gapminder_data <- data.frame(
  Continent = c("Africa", "Asia", "Europe", "Americas", "Oceania"),
  LifeExp_1987 = c(50.2, 65.3, 72.4, 70.1, 74.8),
  LifeExp_2007 = c(56.1, 71.4, 76.3, 74.5, 78.2)
)
life_exp_table <- gapminder_data %>%
  mutate(Change = LifeExp_2007 - LifeExp_1987) %>%
  arrange(desc(abs(Change))) 
gt_table <- life_exp_table %>%
  gt() %>%
  tab_header(
    title = "Life Expectancy Changes by Continent",
    subtitle = "Average life expectancy in years"
  ) %>%
  fmt_number(
    columns = c(LifeExp_1987, LifeExp_2007, Change),
    decimals = 1
  ) %>%
  cols_label(
    Continent = "Continent",
    LifeExp_1987 = "1987",
    LifeExp_2007 = "2007",
    Change = "Change (2007-1987)"
  ) %>%
  tab_options(
    column_labels.font.weight = "bold"
  ) 
gt_table

Continent	1987	2007	Change (2007-1987)
Life Expectancy Changes by Continent
Average life expectancy in years
Asia	65.3	71.4	6.1
Africa	50.2	56.1	5.9
Americas	70.1	74.5	4.4
Europe	72.4	76.3	3.9
Oceania	74.8	78.2	3.4

Data Visualization: • Title: “Life Expectancy Trajectories (1987-2007)” • Subtitle: “in Selected Countries” • Format: o Lines with size = 1.5 o Use scale_color_brewer(palette = “Set1”) o theme_minimal() with no minor grid lines o Legend at bottom o Title in bold (size = 14) o Subtitle size = 12 o Clear axis labels (“Year”, “Life Expectancy (years)”)

selected_countries <- c("India", "China", "United States", "Brazil", "South Africa")
gapminder_clean <- gapminder %>%
  mutate(
    country_clean = case_when(
      country %in% selected_countries ~ country,
      TRUE ~ NA_character_
    ),
    year_clean = case_when(
      year %in% c(1987, 2007) ~ year,
      TRUE ~ NA_real_ 
    )
  ) %>%
  filter(!is.na(country_clean) & !is.na(year_clean)) %>%
  select(country, continent, year, lifeExp)
ggplot(gapminder_clean, aes(x = year, y = lifeExp, color = country)) +
  geom_line(linewidth = 1.5) +  
  scale_color_brewer(palette = "Set1") +  
  theme_minimal() +  
  theme(
    panel.grid.minor = element_blank(),  
    legend.position = "bottom",  
    plot.title = element_text(face = "bold", size = 14),  
    plot.subtitle = element_text(size = 12)  
  ) +
  labs(
    title = "Life Expectancy Trajectories (1987-2007)",
    subtitle = "In Selected Countries",
    x = "Year",
    y = "Life Expectancy (years)",
    color = "Country"
  )

d. Interpretation: 1. Continental trends: o Compare changes across regions o Discuss initial and final values for life expectancy (i.e., 1987 and 2007) 2. Five-country analysis: o Compare magnitude of changes o Contrast different trajectories (highlight which had the highest increase, which had the lowest increase)

Interpretation Answer:

Continental Trends: Comparison of Changes Across Regions: Asia had the largest increase in life expectancy from 1987 to 2007 (+6.1 years). The next continent was Africa with a +5.9 year increase. The Americas (+4.4), Europe (+3.9), and Oceania (+3.4)had smaller increases compared to Asia and Africa. All continents saw an increase in life expectancy, but the rate of improvement varied. Initial and Final Values for Life Expectancy: In 1987, Oceania (74.8 years) and Europe (72.4 years) had the highest life expectancy, followed by the Americas (70.1 years). Asia (65.3 years) and Africa (50.2 years) had lower life expectancies, with Africa being the lowest. By 2007, the rankings stayed the same, even with every continent improving. Oceania (78.2 years) and Europe (76.3 years) still had the highest life expectancy, while Africa (56.1 years) remained the lowest. The gap between continents narrowed due to Asia and Africa significantly improving.

Five-Country Analysis: Magnitude of Changes: China, Brazil, India, and the U.S. all had an increase in life expectancy. South Africa was the only country that saw a decrease in life expectancy. Contrasting Trajectories: The countries with the highest increase were China, India, and Brazil. The country with the lowest increase was The U.S. The country with a decrease was South Africa.

Task 2: Interpersonal Trust Patterns (3 points)

Data Manipulation (anes_2020 data) • Remove missing values for the variables TrustPeople and AgeGroup • Calculate percentage of trust categories by age group • Calculate total sample size

table(anes_2020$TrustPeople)

## 
##              Always    Most of the time About half the time    Some of the time 
##                  48                3511                2020                1597 
##               Never 
##                 264

table(anes_2020$AgeGroup)

## 
##       18-29       30-39       40-49       50-59       60-69 70 or older 
##         871        1241        1081        1200        1436        1330

# First, let's store the total valid responses
total_valid <- anes_2020 %>%
  filter(!is.na(TrustPeople),!is.na(AgeGroup)) %>%
  nrow()
total_valid

## [1] 7153

# Now calculate proportions using this total
trust_props <- anes_2020 %>%
  filter(!is.na(TrustPeople),!is.na(AgeGroup)) %>%    # Remove missing values first
  group_by(AgeGroup, TrustPeople ) %>%         # Group by trust response
  summarize(                            # Calculate counts and percentages
    count = n(),
    percentage = round(100 * count / total_valid, 1)  # Use total_valid instead of sum(n())
  ) %>%
  arrange(desc(count))                  # Sort by frequency

## `summarise()` has grouped output by 'AgeGroup'. You can override using the
## `.groups` argument.

trust_props

## # A tibble: 30 × 4
## # Groups:   AgeGroup [6]
##    AgeGroup    TrustPeople         count percentage
##    <fct>       <fct>               <int>      <dbl>
##  1 70 or older Most of the time      787       11  
##  2 60-69       Most of the time      752       10.5
##  3 50-59       Most of the time      586        8.2
##  4 30-39       Most of the time      502        7  
##  5 40-49       Most of the time      476        6.7
##  6 30-39       About half the time   378        5.3
##  7 60-69       About half the time   362        5.1
##  8 50-59       About half the time   325        4.5
##  9 40-49       About half the time   314        4.4
## 10 70 or older About half the time   287        4  
## # ℹ 20 more rows

trust_dist <- anes_2020 %>%
  # Calculate total responses including missing before filtering
  mutate(total_responses = n()) %>%
  # Remove missing values for the distribution analysis
  filter(!is.na(TrustPeople),!is.na(AgeGroup)) %>%
  # Set proper ordering of trust levels
  mutate(trust_level = factor(TrustPeople, 
    levels = c("Never", 
               "Some of the time",
               "About half the time", 
               "Most of the time",
               "Always"))) %>%
  # Group by trust level and get counts
  group_by(AgeGroup, TrustPeople) %>%
  summarize(
    count = n(),
    .groups = 'drop'
  ) %>%
  group_by(AgeGroup) %>%
  # Calculate total and percentages
  mutate(
    n_total = sum(count),
    percentage = round(count / n_total * 100, 1)
  )

trust_dist

## # A tibble: 30 × 5
## # Groups:   AgeGroup [6]
##    AgeGroup TrustPeople         count n_total percentage
##    <fct>    <fct>               <int>   <int>      <dbl>
##  1 18-29    Always                  7     871        0.8
##  2 18-29    Most of the time      268     871       30.8
##  3 18-29    About half the time   278     871       31.9
##  4 18-29    Some of the time      246     871       28.2
##  5 18-29    Never                  72     871        8.3
##  6 30-39    Always                 10    1239        0.8
##  7 30-39    Most of the time      502    1239       40.5
##  8 30-39    About half the time   378    1239       30.5
##  9 30-39    Some of the time      281    1239       22.7
## 10 30-39    Never                  68    1239        5.5
## # ℹ 20 more rows

Table Creation: • Title: “Interpersonal Trust by Age Group” • Subtitle: “Distribution of responses (percentages)” • Format: o All percentages to one decimal place o Bold “Age Group” header o All trust categories as column headers • Source note: “Data: ANES 2020 (sample size value)”
Data Visualization: • Create stacked bar plot • Title: “Interpersonal Trust Distribution by Age Group” • Format: o Horizontal bars (coord_flip()) o Use viridis color palette (option = “mako”) o Theme_minimal() o Legend at right side o Percentage scale on y-axis o Clear labels for axes and legend o Caption showing sample size

ggplot(
    data = anes_2020 %>% 
           filter(!is.na(TrustPeople),!is.na(AgeGroup)),
    mapping = aes(x = AgeGroup, fill = TrustPeople)
) +
    geom_bar(
        position = "fill",
        color = "white",
        alpha = 0.9
    ) +
  coord_flip() +
    scale_fill_viridis_d(
        option = "viridis",
        direction = -1
    ) +
    scale_y_continuous(
        labels = scales::percent,
        breaks = seq(0, 1, 0.2)
    ) +
    labs(
        title = "Interpersonal Trust Distribution by Age Group",
        x = NULL,
        y = "Percentage Scale",
        fill = "Level of Trust"
    ) +
    theme_minimal() +
    theme(
        legend.position = "right",
        legend.title = element_text(face = "bold"),
        plot.title = element_text(face = "bold", size = 14),
        axis.text = element_text(size = 10)
    )

d. Interpretation Write a paragraph analyzing: • Age patterns in trust levels • Distribution of response categories • Key differences between age groups Analyzing trust levels across different age groups reveals that older individuals tend to exhibit higher levels of trust compared to younger age groups. The proportion of respondents who trust others “most of the time” can be seen increasing with age, with those aged 70 or older showing the highest levels of trust. Younger respondents show significantly lower trust, with a higher proportion choosing “some of the time” or “never” with the 18-29 group showing the lowest (“never”) level of trust. The middle-aged group (30-49) exhibits a wider range of trust responses, showing greater variability in trust attitudes This stacked bar plot suggests that trust in others likely increases with age.

Task 3: Views on Social Fairness (4 points)

Data Manipulation (ess data) • Filter ESS data for Italy and Denmark • Clean sofrdst variable: o Remove refusal, DK, NA • Calculate response distributions • Create education categories: o Either as 2 or 3 recoded categories (make the case for your categorization!) • Calculate sample sizes

library(fst)  
italy_data <- read_fst("italy_data.fst")

library(fst)  
denmark_data <- read_fst("denmark_data.fst")

italy_data <- italy_data %>%
  mutate(sofrdst = as.character(sofrdst),  
         sofrdst = case_when(
           sofrdst %in% c("refusal", "DK") ~ NA_character_, 
           is.na(sofrdst) ~ NA_character_,  
           TRUE ~ sofrdst  
         ))

denmark_data <- denmark_data %>%
 mutate(sofrdst = as.character(sofrdst), 
         sofrdst = case_when(
           sofrdst %in% c("refusal", "DK") ~ NA_character_,  
           is.na(sofrdst) ~ NA_character_,  
           TRUE ~ sofrdst  
         ))

table(denmark_data$sofrdst)

## 
##   1   2   3   4   5   7   8   9 
##  79 268 325 674 202   4  16   4

table(italy_data$sofrdst)

## 
##    1    2    3    4    5    7    8 
##  692 1346  448  174   31    9   45

combined_data <- bind_rows(
  denmark_data %>% mutate(country = "Denmark"),
  italy_data %>% mutate(country = "Italy")
)

eisced_summary <- combined_data %>%
  mutate(
    eisced_label = case_when(
      eisced == 0 ~ "Cannot harmonize",
      eisced == 1 ~ "Less than lower secondary",
      eisced == 2 ~ "Lower secondary",
      eisced == 3 ~ "Lower tier upper secondary",
      eisced == 4 ~ "Upper tier upper secondary",
      eisced == 5 ~ "Advanced vocational",
      eisced == 6 ~ "Bachelor's level",
      eisced == 7 ~ "Master's level or higher",
      TRUE ~ "Missing"
    )
  ) %>%
  count(country, eisced, eisced_label) %>%
  mutate(
    pct = n / sum(n) * 100,
    valid_pct = ifelse(eisced %in% 0:7, n / sum(n[eisced %in% 0:7]) * 100, NA)
  )

print(eisced_summary)

##    country eisced               eisced_label    n          pct valid_pct
## 1  Denmark      1  Less than lower secondary  610  2.700788099  2.723214
## 2  Denmark      2            Lower secondary 2353 10.417958027 10.504464
## 3  Denmark      3 Lower tier upper secondary 3468 15.354644470 15.482143
## 4  Denmark      4 Upper tier upper secondary  967  4.281413265  4.316964
## 5  Denmark      5        Advanced vocational 1282  5.676082529  5.723214
## 6  Denmark      6           Bachelor's level 2410 10.670326751 10.758929
## 7  Denmark      7   Master's level or higher 1239  5.485699106  5.531250
## 8  Denmark     55                    Missing   14  0.061985301        NA
## 9  Denmark     77                    Missing    2  0.008855043        NA
## 10 Denmark     88                    Missing    4  0.017710086        NA
## 11 Denmark     99                    Missing   59  0.261223767        NA
## 12   Italy      0           Cannot harmonize 1207  5.344018418  5.388393
## 13   Italy      1  Less than lower secondary 1090  4.825998406  4.866071
## 14   Italy      2            Lower secondary 2685 11.887895156 11.986607
## 15   Italy      3 Lower tier upper secondary  626  2.771628442  2.794643
## 16   Italy      4 Upper tier upper secondary 2874 12.724696715 12.830357
## 17   Italy      5        Advanced vocational  295  1.306118835  1.316964
## 18   Italy      6           Bachelor's level  403  1.784291154  1.799107
## 19   Italy      7   Master's level or higher  891  3.944921633  3.977679
## 20   Italy     55                    Missing   25  0.110688037        NA
## 21   Italy     77                    Missing   68  0.301071460        NA
## 22   Italy     88                    Missing    9  0.039847693        NA
## 23   Italy     99                    Missing    5  0.022137607        NA

combined_data <- bind_rows(
  denmark_data %>% mutate(country = "Denmark"),
  italy_data %>% mutate(country = "Italy")
)

education_summary <- combined_data %>%
  mutate(
    education = case_when(
      # Secondary or less (ES-ISCED levels 1-4)
      eisced %in% 1:4 ~ "Up to Secondary",
      # Bachelor's level (ES-ISCED level 6)
      eisced == 6 ~ "Bachelor's",
      # Master's or higher (ES-ISCED levels 7 and 8)
      eisced %in% 7:8 ~ "Master's",
      TRUE ~ NA_character_
    ),
    education = factor(education, levels = c("Up to Secondary", "Bachelor's", "Master's"))
  ) %>%
  count(country, education) %>%
  mutate(
    pct = n / sum(n) * 100, 
    valid_pct = n / sum(!is.na(education)) * 100
  )

print(education_summary)

##   country       education    n       pct  valid_pct
## 1 Denmark Up to Secondary 7398 32.754804 123300.000
## 2 Denmark      Bachelor's 2410 10.670327  40166.667
## 3 Denmark        Master's 1239  5.485699  20650.000
## 4 Denmark            <NA> 1361  6.025857  22683.333
## 5   Italy Up to Secondary 7275 32.210219 121250.000
## 6   Italy      Bachelor's  403  1.784291   6716.667
## 7   Italy        Master's  891  3.944922  14850.000
## 8   Italy            <NA> 1609  7.123882  26816.667

JUSTIFICATION: I used three categories because I feel that it helps to distinguish the levels of education and their varying impacts on income and wealth (I.E. Between Bachelors and Masters education level)

combined_data <- bind_rows(
  denmark_data %>% mutate(country = "Denmark"),
  italy_data %>% mutate(country = "Italy")
)
combined_data <- combined_data %>%
  mutate(
    education = case_when(
      eisced %in% 1:4 ~ "Up to Secondary",  
      eisced == 6 ~ "Bachelor's",           
      eisced %in% 7:8 ~ "Master's",         
      TRUE ~ NA_character_                 
    ),
    education = factor(education, levels = c("Up to Secondary", "Bachelor's", "Master's"))
  )

sample_size_summary <- combined_data %>%
  group_by(country, education) %>%
  summarise(
    n = n(),  
    .groups = "drop"
  )

print(sample_size_summary)

## # A tibble: 8 × 3
##   country education           n
##   <chr>   <fct>           <int>
## 1 Denmark Up to Secondary  7398
## 2 Denmark Bachelor's       2410
## 3 Denmark Master's         1239
## 4 Denmark <NA>             1361
## 5 Italy   Up to Secondary  7275
## 6 Italy   Bachelor's        403
## 7 Italy   Master's          891
## 8 Italy   <NA>             1609

response_distribution <- combined_data %>%
  filter(!is.na(sofrdst)) %>%  # Remove NAs from the 'sofrdst' variable if necessary
  group_by(country, sofrdst) %>%
  summarise(
    n = n(),  # Sample size for each category
    pct = n / sum(n) * 100,  # Calculate percentage
    .groups = "drop"
  ) %>%
  mutate(
    pct = round(pct, 1)  # Round percentages to one decimal place
  )

response_table <- response_distribution %>%
  mutate(
    country = recode(country, 
                     "Denmark" = "Denmark", 
                     "Italy" = "Italy"),  
    sofrdst = factor(sofrdst, 
                     levels = c("Agree strongly", "Agree", "Neither agree nor disagree", 
                                "Disagree", "Disagree strongly"))  
  ) %>%
  gt() %>%
  tab_header(
    title = "Views on Fair Income Distribution",
    subtitle = "Response distribution by country (%)"
  ) %>%
  cols_label(
    country = "Country",
    sofrdst = "Response Categories",
    pct = "Percentage (%)"
  ) %>%
  tab_spanner(
    label = "Response Distribution",
    columns = c("pct")
  ) %>%
  tab_style(
    style = list(cell_text(weight = "bold")),
    locations = cells_column_labels(columns = vars(country))
  ) %>%
  tab_source_note(
    source_note = paste("Sample sizes: Denmark (", sum(response_distribution$n[response_distribution$country == 'Denmark']), 
                        "), Italy (", sum(response_distribution$n[response_distribution$country == 'Italy']), ")", sep = "")
  )

## Warning: Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.

response_table

Country	Response Categories	n	Response Distribution
Views on Fair Income Distribution
Response distribution by country (%)
Country	Response Categories	n	Percentage (%)
Denmark	NA	79	100
Denmark	NA	268	100
Denmark	NA	325	100
Denmark	NA	674	100
Denmark	NA	202	100
Denmark	NA	4	100
Denmark	NA	16	100
Denmark	NA	4	100
Italy	NA	692	100
Italy	NA	1346	100
Italy	NA	448	100
Italy	NA	174	100
Italy	NA	31	100
Italy	NA	9	100
Italy	NA	45	100
Sample sizes: Denmark (1572), Italy (2745)

ggplot(
    data = combined_data,
    mapping = aes(
        x = as.numeric(sofrdst),
        y = country,
        fill = country
    )
) +
    geom_density_ridges(
        alpha = 0.7,
        scale = 0.9
    ) +
    scale_fill_manual(values = c("Denmark" = "#4B9CD3", "Italy" = "#F4B942")) +
    scale_x_continuous(
        breaks = 1:5,
        labels = c("Agree strongly", "Agree", "Neither agree nor disagree", 
                   "Disagree", "Disagree strongly")
    ) +
    labs(
        title = "Distribution of Views on Income Equality",
        subtitle = "Comparison between Italy and Denmark",
        x = "Response Categories",
        y = NULL
    ) +
    theme_minimal() +
    theme(
        legend.position = "none",
        panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1),  
        plot.margin = margin(10, 10, 50, 10)  
    )

## Picking joint bandwidth of 0.187

## Warning: Removed 18269 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

combined_data <- combined_data %>%
  mutate(
    education = case_when(
      eisced %in% 1:4 ~ "Up to Secondary",
      eisced == 6 ~ "Bachelor's",
      eisced %in% 7:8 ~ "Master's",
      TRUE ~ NA_character_
    ),
    education = factor(education, levels = c("Up to Secondary", "Bachelor's", "Master's"))
  )

table(combined_data$education, useNA = "always")

## 
## Up to Secondary      Bachelor's        Master's            <NA> 
##           14673            2813            2130            2970

ggplot(
    data = combined_data %>% filter(!is.na(education)), 
    mapping = aes(
        x = as.numeric(sofrdst),  
        y = education,           
        fill = education          
    )
) +
    geom_density_ridges(
        alpha = 0.7,  
        scale = 0.9   
    ) +
    scale_fill_manual(values = c(
        "Up to Secondary" = "#4B9CD3", 
        "Bachelor's" = "#F4B942", 
        "Master's" = "#D9534F"
    )) +
    scale_x_continuous(
        breaks = 1:5,
        labels = c("Agree strongly", "Agree", "Neither agree nor disagree", 
                   "Disagree", "Disagree strongly")
    ) +
    facet_wrap(~country) +
    labs(
        title = "Views on Income Distribution by Education Level",
        subtitle = "Comparing Italy and Denmark",
        x = "Response Categories",  
        y = NULL                   
    ) +
    theme_minimal() +  
    theme(
        legend.position = "none",  
        panel.grid.minor = element_blank(),  
        axis.text.x = element_text(angle = 45, hjust = 1),  
        strip.text = element_text(face = "bold") 
    )

## Picking joint bandwidth of 0.263

## Picking joint bandwidth of 0.207

## Warning: Removed 15583 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

This plot shows a notable difference in the attitudes on income distribution between Denmark and Italy. Italy shows a much stronger concentration of agreement in the “Agree” and “Agree Strongly” categories compared to Denmark which shows more even distribution if not a higher concentration of disagreement. The education attainment of both countries shows that respondents with a Masters Degree display a wider range of opinions while those with lower education levels show more polarization. Overall, the plot shows that higher education levels have more variability in opinions on income distribution.