# Required packages
packages <- c(
"tidyverse",
# For data manipulation and ggplot2
"gt",
# For formatted tables
"gapminder",
# For gapminder dataset
"srvyr",
# For survey data
"fst",
# For reading ESS data
"ggridges"
# For density ridge plots
)# Install and load packages
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Adjuntando el paquete: 'srvyr'
##
##
## The following object is masked from 'package:stats':
##
## filter
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "gt" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "gapminder" "gt" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "srvyr" "gapminder" "gt" "lubridate" "forcats" "stringr"
## [7] "dplyr" "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [13] "tidyverse" "stats" "graphics" "grDevices" "utils" "datasets"
## [19] "methods" "base"
##
## [[5]]
## [1] "fst" "srvyr" "gapminder" "gt" "lubridate" "forcats"
## [7] "stringr" "dplyr" "purrr" "readr" "tidyr" "tibble"
## [13] "ggplot2" "tidyverse" "stats" "graphics" "grDevices" "utils"
## [19] "datasets" "methods" "base"
##
## [[6]]
## [1] "ggridges" "fst" "srvyr" "gapminder" "gt" "lubridate"
## [7] "forcats" "stringr" "dplyr" "purrr" "readr" "tidyr"
## [13] "tibble" "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [19] "utils" "datasets" "methods" "base"
file.exists("anes_2020.rda")
## [1] TRUE
load("anes_2020.rda")
head(gapminder)
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
Notes: Name in the columes: country, continent, year, lifeExp, pop, gdpPercap Some notes may be written in Spanish for the author’s convenience.
life_exp_cont <- gapminder %>%
filter(year %in% c(1987, 2007))%>%
group_by(continent) %>%
summarise(
life_1987 = first(lifeExp),
life_2007 = last(lifeExp),
change = life_1987 - life_2007,
avg_life = mean(lifeExp),
.groups = "drop"
) %>%
arrange(desc(avg_life))
life_exp_cont
## # A tibble: 5 × 5
## continent life_1987 life_2007 change avg_life
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Oceania 76.3 80.2 -3.88 78.0
## 2 Europe 72 79.4 -7.42 75.6
## 3 Americas 70.8 73.7 -2.97 70.8
## 4 Asia 40.8 62.7 -21.9 67.8
## 5 Africa 65.8 43.5 22.3 54.1
Explanation of the code:
Only the years 1987 and 2007 are filtered. The data is grouped by continent. The average life expectancy for each year is calculated. The change in life expectancy (2007 - 1987) is calculated. Finally, the continents are ordered according to the magnitude of the change.
life_exp_country <- gapminder %>%
filter(year >= 1985 & year <= 2007) %>%
group_by(country,year) %>%
summarise(
avg_life = mean(lifeExp),
.groups = "drop"
) %>%
arrange(avg_life)
focal_countries <-life_exp_country %>%
filter(country %in% c("Niger", "Bangladesh", "El Salvador", "Iraq", "Zimbabwe"))
print (life_exp_country)
## # A tibble: 710 × 3
## country year avg_life
## <fct> <int> <dbl>
## 1 Rwanda 1992 23.6
## 2 Rwanda 1997 36.1
## 3 Sierra Leone 1992 38.3
## 4 Zambia 2002 39.2
## 5 Swaziland 2007 39.6
## 6 Somalia 1992 39.7
## 7 Sierra Leone 1997 39.9
## 8 Angola 1987 39.9
## 9 Zimbabwe 2002 40.0
## 10 Sierra Leone 1987 40.0
## # ℹ 700 more rows
focal_countries
## # A tibble: 25 × 3
## country year avg_life
## <fct> <int> <dbl>
## 1 Zimbabwe 2002 40.0
## 2 Zimbabwe 2007 43.5
## 3 Niger 1987 44.6
## 4 Zimbabwe 1997 46.8
## 5 Niger 1992 47.4
## 6 Niger 1997 51.3
## 7 Bangladesh 1987 52.8
## 8 Niger 2002 54.5
## 9 Bangladesh 1992 56.0
## 10 Niger 2007 56.9
## # ℹ 15 more rows
library(gt)
enhanced_table <- life_exp_cont %>%
select(-avg_life) %>%
gt() %>%
cols_label(
continent = "Continent",
life_1987 = "1987",
life_2007 = "2007",
change = "Change (2007-1987)"
) %>%
fmt_number(
columns = c('life_2007','life_1987', 'change'),
decimals = 1,
use_seps = TRUE
) %>%
tab_header(
title = md("**Life Expectancy Changes by Continent**"),
subtitle = md("**Average life expectancy in years**")
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels()
) %>%
tab_source_note(
source_note = "Data: gapminder"
)
enhanced_table
| Life Expectancy Changes by Continent | |||
| Average life expectancy in years | |||
| Continent | 1987 | 2007 | Change (2007-1987) |
|---|---|---|---|
| Oceania | 76.3 | 80.2 | −3.9 |
| Europe | 72.0 | 79.4 | −7.4 |
| Americas | 70.8 | 73.7 | −3.0 |
| Asia | 40.8 | 62.7 | −21.9 |
| Africa | 65.8 | 43.5 | 22.3 |
| Data: gapminder | |||
gt() is used to create a nicely formatted table. Not only that, but the values are also formatted to a decimal. Afterwards, it is only a matter of fixing the text layout, where title, subtitle and source notes are added and finally, bold is applied to the column headers.
library(ggplot2)
ggplot(focal_countries, aes(x = year, y = avg_life, group = country, color = country)) +
geom_line(size = 1.5) +
geom_point(size = 3) +
scale_color_brewer(palette = "Set1") +
theme_minimal() +
theme(
legend.position = "bottom",
legend.title = element_blank(),
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 12),
panel.grid.minor = element_blank() # Remove minor grid lines
) +
labs(
title = "Life Expectancy Trajectories (1987-2007)",
subtitle = "in Selected Countries",
x = "Year",
y = "Life Expectancy (years)"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Functions like ggplot2 are used to visualize the evolution of life
expectancy.
While scale_color_brewer(palette = “Set1”) can be used for a light color palette. This ends up making theme_minimal() finish the job and gives us a cleaner design, where minor grid lines are removed for clarity and labels and titles are added with the appropriate format.
Interpreting changes in life expectancy (1987-2007)
The data show something that, in retrospect, is predictable: life expectancy has increased in all regions, but not uniformly. Each continent has its own pace of development, marked by social, economic and political factors that directly influence the quality and length of life of its inhabitants.
If we focus on the continents, we find a clear pattern. Africa and Asia show the most drastic changes in these twenty years. This is no coincidence: investment in public health, the decrease in infectious diseases and improved access to medicines have been decisive in this growth. But, although the change is significant, Africa still lags behind other continents, which reminds us that an increase does not necessarily mean equity.
Europe and North America, on the other hand, show a much smaller variation. This is logical: they were already starting from high values. In 1987, so the improvement is more gradual. Latin America and Oceania fall somewhere in between: notable progress, but without the extreme transformations of Africa and Asia. If we look at the evolution of the data, the gap between continents has narrowed, but there are still marked differences.
When we move from the continental scale to specific cases, the picture becomes more diverse. Bangladesh is the country that has made the most progress in terms of life expectancy. It is not difficult to understand why: in these years, the country has drastically improved its public health conditions, reducing infant mortality rates and preventable diseases.
El Salvador follows a similar trajectory, with constant improvement, although to a lesser extent. Iraq, on the other hand, has less stable growth. And here a factor comes into play that numbers alone do not fully explain: the political context. War and internal conflicts have affected its development in these decades, limiting what otherwise might have been more evident progress.
But the most extreme case is Zimbabwe. Not only did it have the least improvement, but at times its life expectancy decreased. It is no coincidence: the country has faced political, economic and health crises. It is a reminder that development is not linear and that structural problems can slow down or even reverse growth in quality of life.
anes_clean <- anes_2020 %>%
select(TrustPeople, AgeGroup) %>%
drop_na()
summary(anes_clean)
## TrustPeople AgeGroup
## Always : 45 18-29 : 871
## Most of the time :3371 30-39 :1239
## About half the time:1944 40-49 :1080
## Some of the time :1537 50-59 :1199
## Never : 256 60-69 :1435
## 70 or older:1329
trust_distribution <- anes_clean %>%
group_by(AgeGroup, TrustPeople) %>%
summarise(count = n(), .groups = "drop") %>%
mutate(percentage = (count / sum(count)) * 100)
trust_distribution
## # A tibble: 30 × 4
## AgeGroup TrustPeople count percentage
## <fct> <fct> <int> <dbl>
## 1 18-29 Always 7 0.0979
## 2 18-29 Most of the time 268 3.75
## 3 18-29 About half the time 278 3.89
## 4 18-29 Some of the time 246 3.44
## 5 18-29 Never 72 1.01
## 6 30-39 Always 10 0.140
## 7 30-39 Most of the time 502 7.02
## 8 30-39 About half the time 378 5.28
## 9 30-39 Some of the time 281 3.93
## 10 30-39 Never 68 0.951
## # ℹ 20 more rows
To analyze the distribution of interpersonal trust across age groups, the data must first be grouped using the variables AgeGroup and TrustPeople. This allows us to count how many responses are in each combination of age group and trust level. This is essential, as it helps us understand how trust perceptions vary within each age cohort.
After counting the responses, the percentage of each category within its respective age group is calculated. This is achieved by dividing the number of responses in each trust level by the total responses in that group and then multiplying by 100 to obtain the percentage.
Finally, the results are printed to the console to verify that the calculations are correct.
library(gt)
trust_table <- trust_distribution %>%
pivot_wider(names_from = TrustPeople, values_from = percentage) %>%
gt() %>%
cols_label(
AgeGroup = "Age Group"
) %>%
fmt_number(
columns = -AgeGroup,
decimals = 1
) %>%
tab_header(
title = md("**Interpersonal Trust by Age Group**"),
subtitle = md("**Distribution of responses (percentages)**")
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels()
) %>%
tab_source_note(
source_note = md("**Data: ANES 2020**")
)
trust_table
| Interpersonal Trust by Age Group | ||||||
| Distribution of responses (percentages) | ||||||
| Age Group | count | Always | Most of the time | About half the time | Some of the time | Never |
|---|---|---|---|---|---|---|
| 18-29 | 7.0 | 0.1 | NA | NA | NA | NA |
| 18-29 | 268.0 | NA | 3.7 | NA | NA | NA |
| 18-29 | 278.0 | NA | NA | 3.9 | NA | NA |
| 18-29 | 246.0 | NA | NA | NA | 3.4 | NA |
| 18-29 | 72.0 | NA | NA | NA | NA | 1.0 |
| 30-39 | 10.0 | 0.1 | NA | NA | NA | NA |
| 30-39 | 502.0 | NA | 7.0 | NA | NA | NA |
| 30-39 | 378.0 | NA | NA | 5.3 | NA | NA |
| 30-39 | 281.0 | NA | NA | NA | 3.9 | NA |
| 30-39 | 68.0 | NA | NA | NA | NA | 1.0 |
| 40-49 | 8.0 | 0.1 | NA | NA | NA | NA |
| 40-49 | 476.0 | NA | 6.7 | NA | NA | NA |
| 40-49 | 314.0 | NA | NA | 4.4 | NA | NA |
| 40-49 | 247.0 | NA | NA | NA | 3.5 | NA |
| 40-49 | 35.0 | NA | NA | NA | NA | 0.5 |
| 50-59 | 2.0 | 0.0 | NA | NA | NA | NA |
| 50-59 | 586.0 | NA | 8.2 | NA | NA | NA |
| 50-59 | 325.0 | NA | NA | 4.5 | NA | NA |
| 50-59 | 249.0 | NA | NA | NA | 3.5 | NA |
| 50-59 | 37.0 | NA | NA | NA | NA | 0.5 |
| 60-69 | 10.0 | 0.1 | NA | NA | NA | NA |
| 60-69 | 752.0 | NA | 10.5 | NA | NA | NA |
| 60-69 | 362.0 | NA | NA | 5.1 | NA | NA |
| 60-69 | 284.0 | NA | NA | NA | 4.0 | NA |
| 60-69 | 27.0 | NA | NA | NA | NA | 0.4 |
| 70 or older | 8.0 | 0.1 | NA | NA | NA | NA |
| 70 or older | 787.0 | NA | 11.0 | NA | NA | NA |
| 70 or older | 287.0 | NA | NA | 4.0 | NA | NA |
| 70 or older | 230.0 | NA | NA | NA | 3.2 | NA |
| 70 or older | 17.0 | NA | NA | NA | NA | 0.2 |
| Data: ANES 2020 | ||||||
library(ggplot2)
ggplot(trust_distribution, aes(x = AgeGroup, y = percentage, fill = TrustPeople)) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_viridis_d(option = "mako") +
theme_minimal() +
theme(
legend.position = "right",
legend.title = element_blank(),
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 12),
axis.title.y = element_text(face = "bold"),
panel.grid.major.x = element_blank()
) +
labs(
title = "Interpersonal Trust Distribution by Age Group",
subtitle = "Proportion of responses in each category",
x = "Age Group",
y = "Proportion",
caption = "Data: ANES 2020"
) +
coord_flip() # Voltea las barras para mejor lectura
The stacked bar chart is built using geom_bar(stat = “identity”,
position = “fill”). This setting is key because it allows us to
represent the distribution of responses as proportions within each age
group rather than counting absolute responses. By using position =
“fill”, we ensure that each bar has the same overall height (100%),
making it easier to visually compare proportions across different
categories. It is also good to know that if you use coord_flip(), it
swaps the X and Y axes, turning the graph into a horizontal bar format.
This choice is not merely aesthetic: by flipping the bars, we make it
easier to read the labels of each age group and avoid overlapping text,
making the graph clearer and more understandable.
Interpretation Interpersonal Trust Patterns:
The data show that interpersonal trust varies significantly across age groups. Generally speaking, older adults tend to express higher trust in people compared to younger groups, who are more skeptical. The distribution of responses also suggests that the perception of trust could be influenced by life experiences, social stability, and generation. In particular, the younger group presents the highest distrust, which could be related to the current social context and exposure to global crisis events. In summary, age seems to be a relevant factor in how people perceive the trustworthiness of others, and these data provide clear evidence of a generational pattern in interpersonal trust.
library(tidyverse)
library(fst) # Para leer archivos .fst
denmark_data <- read_fst("denmark_data.fst")
italy_data <- read_fst("italy_data.fst")
clean_denmark <- denmark_data %>%
filter(!sofrdst %in% c("Refusal", "DK", "NA"))
clean_italy <- italy_data %>%
filter(!sofrdst %in% c("Refusal", "DK", "NA"))
denmark_distribution <- clean_denmark %>%
group_by(sofrdst) %>%
summarise(count = n(), .groups = "drop") %>%
mutate(percentage = (count / sum(count)) * 100)
italy_distribution <- clean_italy %>%
group_by(sofrdst) %>%
summarise(count = n(), .groups = "drop") %>%
mutate(percentage = (count / sum(count)) * 100)
# Combinar ambos datasets para facilitar la comparación, similar a lo que paso con el grupo de continentes al inicio
fairness_distribution <- bind_rows(
denmark_distribution %>% mutate(country = "Denmark"),
italy_distribution %>% mutate(country = "Italy")
)
Combine both datasets to facilitate comparison, similar to what happened with the continent group at the beginning
library(gt)
fairness_table <- fairness_distribution %>%
pivot_wider(names_from = sofrdst, values_from = percentage) %>%
gt() %>%
cols_label(
country = "Country"
) %>%
fmt_number(
columns = -country,
decimals = 1
) %>%
tab_header(
title = md("**Views on Fair Income Distribution**"),
subtitle = md("**Response distribution by country (%)**")
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels()
) %>%
tab_source_note(
source_note = md("**Data: ESS (Denmark & Italy)**")
)
fairness_table
| Views on Fair Income Distribution | ||||||||||
| Response distribution by country (%) | ||||||||||
| count | Country | 1 | 2 | 3 | 4 | 5 | 7 | 8 | 9 | NA |
|---|---|---|---|---|---|---|---|---|---|---|
| 79.0 | Denmark | 0.6 | NA | NA | NA | NA | NA | NA | NA | NA |
| 268.0 | Denmark | NA | 2.2 | NA | NA | NA | NA | NA | NA | NA |
| 325.0 | Denmark | NA | NA | 2.6 | NA | NA | NA | NA | NA | NA |
| 674.0 | Denmark | NA | NA | NA | 5.4 | NA | NA | NA | NA | NA |
| 202.0 | Denmark | NA | NA | NA | NA | 1.6 | NA | NA | NA | NA |
| 4.0 | Denmark | NA | NA | NA | NA | NA | 0.0 | NA | 0.0 | NA |
| 16.0 | Denmark | NA | NA | NA | NA | NA | NA | 0.1 | NA | NA |
| 10,836.0 | Denmark | NA | NA | NA | NA | NA | NA | NA | NA | 87.3 |
| 692.0 | Italy | 6.8 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1,346.0 | Italy | NA | 13.2 | NA | NA | NA | NA | NA | NA | NA |
| 448.0 | Italy | NA | NA | 4.4 | NA | NA | NA | NA | NA | NA |
| 174.0 | Italy | NA | NA | NA | 1.7 | NA | NA | NA | NA | NA |
| 31.0 | Italy | NA | NA | NA | NA | 0.3 | NA | NA | NA | NA |
| 9.0 | Italy | NA | NA | NA | NA | NA | 0.1 | NA | NA | NA |
| 45.0 | Italy | NA | NA | NA | NA | NA | NA | 0.4 | NA | NA |
| 7,433.0 | Italy | NA | NA | NA | NA | NA | NA | NA | NA | 73.0 |
| Data: ESS (Denmark & Italy) | ||||||||||
Visualizing Response Distribution: “I hated this part”
library(ggplot2)
ggplot(fairness_distribution, aes(x = sofrdst, y = percentage, fill = country)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_brewer(palette = "Set1") +
theme_minimal() +
theme(
legend.position = "top",
legend.title = element_blank(),
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 12),
axis.title.x = element_text(face = "bold")
) +
labs(
title = "Distribution of Views on Income Equality",
subtitle = "Comparison between Italy and Denmark",
x = "Response Category",
y = "Percentage",
caption = "Data: ESS"
)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
education_levels <- function(eisced) {
case_when(
eisced %in% c(0, 1, 2) ~ "Basic", # Educación primaria y secundaria baja
eisced %in% c(3, 4) ~ "Intermediate", # Secundaria alta y post-secundaria
eisced %in% c(5, 6) ~ "Higher", # Universitario y superior
TRUE ~ "Other"
)
}
clean_denmark <- clean_denmark %>% mutate(education_group = education_levels(eisced))
clean_italy <- clean_italy %>% mutate(education_group = education_levels(eisced))
education_distribution <- bind_rows(
clean_denmark %>%
group_by(education_group, sofrdst) %>%
summarise(count = n(), .groups = "drop") %>%
mutate(percentage = (count / sum(count)) * 100) %>%
mutate(country = "Denmark"),
clean_italy %>%
group_by(education_group, sofrdst) %>%
summarise(count = n(), .groups = "drop") %>%
mutate(percentage = (count / sum(count)) * 100) %>%
mutate(country = "Italy")
)
ggplot(education_distribution, aes(x = sofrdst, y = percentage, fill = education_group)) +
geom_bar(stat = "identity", position = "dodge") + # Barras agrupadas para comparar
scale_fill_brewer(palette = "Set1") + # Colores diferenciados
facet_wrap(~country) + # Un gráfico por país
theme_minimal() +
theme(
legend.position = "top",
legend.title = element_blank(),
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 12),
axis.title.x = element_text(face = "bold")
) +
labs(
title = "Views on Income Distribution by Education Level",
subtitle = "Comparing Italy and Denmark",
x = "Response Category",
y = "Percentage",
caption = "Data: ESS"
)
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).
At the beginning I made a mistake, which is that the variable that I was trying to use, education, does not exist with that name in the datasets for Italy and Denmark. To solve this, it was necessary to verify what the correct name of the variable representing the educational level is
I had to run the colnames code in RStudio to list the column names in each dataset. Find the column that corresponds to the educational level (it can be called something like education_level, edu, schooling, etc.). And Based on the names, the most appropriate variable to represent the educational level in the European Social Survey (ESS) is usually eisced, which corresponds to the International Standard Classification of Education (ISCED). This variable categorizes the educational level in a standard format.
After solving this, I was able to create a faceted graph to compare the distribution of opinions on fairness in income distribution in Italy and Denmark, segmented by educational level.
"The process crashed the computer about three times"
Interpretation of Results
Differences between countries:
Denmark shows a more homogeneous tendency towards equity in income distribution, with a higher percentage of people believing that income should be fairer. Italy, in contrast, has a more fragmented distribution, with more divided opinions on whether the distribution is fair or not.
Educational patterns:
The data suggest that as the level of education increases, the perception of income equity changes. In both countries, people with higher education tend to be more supportive of the idea of a more equitable distribution, while those with basic education have more dispersed responses.
Education appears to be a key factor in the perception of economic justice. However, cultural differences between Italy and Denmark also influence how income equity is perceived.