# List of packages
packages <- c("tidyverse", "fst", "modelsummary", "viridis") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "viridis" "viridisLite" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
spain_data <- read.fst("spain_data.fst")
spain_data <- spain_data %>%
mutate(
trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt), # set values 77, 88, and 99 to NA.
)
table(spain_data$trstplt)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 5165 1830 2329 2441 2085 2890 1154 639 355 80 71
spain_data$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
spain_data$year[spain_data$essround == i] <- replacements[i]
}
table(spain_data$year)
##
## 2002 2004 2006 2008 2010 2012 2014 2016 2018 2020
## 1729 1663 1876 2576 1885 1889 1925 1958 1668 2283
trust_by_year <- spain_data %>%
group_by(year) %>%
summarize(mean_trust = mean(trstplt, na.rm = TRUE))
trust_by_year
## # A tibble: 10 × 2
## year mean_trust
## <dbl> <dbl>
## 1 2002 3.41
## 2 2004 3.66
## 3 2006 3.49
## 4 2008 3.32
## 5 2010 2.72
## 6 2012 1.91
## 7 2014 2.23
## 8 2016 2.40
## 9 2018 2.55
## 10 2020 1.94
ggplot(trust_by_year, aes(x = year, y = mean_trust)) +
geom_line(color = "blue", size = 1) +
geom_point(color = "red", size = 3) +
labs(title = "Trust in Parliament in Spain (2002-2020)",
x = "Survey Year",
y = "Average Trust in Parliament (0-5 scale)") +
ylim(0, 5) + # Setting the y-axis limits from 0 to 5
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(trust_by_year, aes(x = year, y = mean_trust)) +
geom_line(aes(group = 1), color = "blue", size = 1, linetype = "longdash") + # Dashed line for the trend
# geom_point(aes(color = mean_trust), size = 3) + # if you want to try alternative + strive to keep improving
# scale_color_viridis(option = "D", end = 0.9, direction = -1) + # try the alternative, and try to keep improving
labs(title = "Trust in Parliament in Spain (2002-2020)",
x = "Survey Year",
y = "Average Trust in Parliament (0-5 scale)") +
ylim(1, 5) + # change the numbers to truncate more or less or keep the full 0 to 10 scale
theme_minimal() + # Minimal theme for a clean look
theme(legend.position = "none") # Remove the legend
Prompt: in the tutorial, we calculated the average trust in others for France and visualized it. Using instead the variable ‘Trust in Parliament’ (trstplt) and the country of Spain (country file provided on course website), visualize the average trust by survey year. You can truncate the y-axis if you wish. Provide appropriate titles and labels given the changes. What are your main takeaways based on the visual (e.g., signs of increase, decrease, or stall)?
Answer: The average trust in parliament in Spain was highest in 2004 at 3.6 after which there was a gradual decline. Although there is a decline till approximately approximately 2012, we can see a gradual increase in the average trust in parliament from 2013 to 2018 (Average from 2.0 to 2.5). Based on this visual, we can conclude that trust in parliament began to decline in the 2000s and has stalled since 2012.
Provide answer only.
Prompt and question: Based on the figure we produced above called task2_plot, tell us: what are your main takeaways regarding France relative to Italy and Norway? Make sure to be concrete and highlight at least two important comparative trends visualized in the graph.
Answer: From 1920 to 1950, France is the only country out of the three which displayed an increasing trend towards citizens saying “yes” to feeling closenes to a party by cohort, while the proportion of those saying “yes” for Italy and Norway were both decreasing. However, by 1950, the proportion of citizens saying yes in France began to drop at a rapid rate (from 0.60 to 0.23). Additionally, the decrease is more gradual for Italy and Norway and has a small increase in “yes” responses from 1945-1950. The key takeaway from this graph is that the proportion of French individuals who responded “yes” has severely decreased since 1950 whereas Italy and Norway experienced a gradual decrease in citizens saying “yes.”
italy_data <- read.fst("italy_data.fst")
italy_data <- italy_data %>%
# Modify 'clsprty' column: set values of 2 to 0, and values in 7, 8, 9 to NA. Retain other values as is.
mutate(
clsprty = ifelse(clsprty == 2, 1, ifelse(clsprty %in% c(7, 8, 9), NA, clsprty))
) %>%
# Modify 'yrbrn' column: set specific values (7777, 8888, 9999) to NA.
mutate(
yrbrn = ifelse(yrbrn %in% c(7777, 8888, 9999), NA, yrbrn)
)
italy_data <- italy_data %>%
mutate(
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
TRUE ~ NA_character_ # Set anything that is not 1 or 2 to NA
),
lrscale = case_when(
lrscale %in% 1 ~ "Yes",
lrscale %in% 2 ~ "No",
TRUE ~ NA_character_ # Moderate (4, 5, 6) and special codes (77, 88, 99) set to NA
)
)
lrscale_percentages <- italy_data %>% # Begin with the dataset 'france_data'
filter(!is.na(lrscale), !is.na(gndr)) %>% # Filter out rows where 'lrscale' or 'gender' is NA (missing data)
group_by(gndr, lrscale) %>% # Group the data by 'gender' and 'lrscale' categories
summarise(count = n(), .groups = 'drop') %>% # Summarise each group to get counts, and then drop groupings
mutate(percentage = count / sum(count) * 100) # Calculate percentage for each group by dividing count by total count and multiplying by 100
lrscale_percentages # The resulting dataframe
## # A tibble: 4 × 4
## gndr lrscale count percentage
## <chr> <chr> <int> <dbl>
## 1 Female No 225 33.7
## 2 Female Yes 107 16.0
## 3 Male No 248 37.1
## 4 Male Yes 88 13.2
Provide code and answer.
Question: What is the marginal percentage of Italian men who feel close to a particular political party?
Answer: Of all Italian respondents, 13.2% are males who feel close to a particular political party.
Provide code and output only.
Prompt: In the tutorial, we calculated then visualized the percentage distribution for left vs. right by gender for France. Your task is to replicate the second version of the visualization but for the country of Sweden instead.
sweden_data <- read.fst("sweden_data.fst")
sweden_data <- sweden_data %>%
mutate(
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
TRUE ~ NA_character_ # Set anything that is not 1 or 2 to NA
),
lrscale = case_when(
lrscale %in% 0:3 ~ "Left Wing", # Left-wing (0 to 3)
lrscale %in% 7:10 ~ "Right Wing", # Right-wing (7 to 10)
TRUE ~ NA_character_ # Moderate (4, 5, 6) and special codes (77, 88, 99) set to NA
)
)
lrscale_percentages <- sweden_data %>% # Begin with the dataset 'france_data'
filter(!is.na(lrscale), !is.na(gndr)) %>% # Filter out rows where 'lrscale' or 'gender' is NA (missing data)
group_by(gndr, lrscale) %>% # Group the data by 'gender' and 'lrscale' categories
summarise(count = n(), .groups = 'drop') %>% # Summarise each group to get counts, and then drop groupings
mutate(percentage = count / sum(count) * 100) # Calculate percentage for each group by dividing count by total count and multiplying by 100
lrscale_percentages # The resulting dataframe
## # A tibble: 4 × 4
## gndr lrscale count percentage
## <chr> <chr> <int> <dbl>
## 1 Female Left Wing 2296 23.0
## 2 Female Right Wing 2530 25.3
## 3 Male Left Wing 2062 20.6
## 4 Male Right Wing 3107 31.1
# Create a ggplot object for horizontal bar chart with the specified style
lrscale_plot_v2 <- ggplot(lrscale_percentages,
aes(x = percentage, # Use percentage directly
y = reorder(gndr, -percentage), # Order bars within each gender
fill = gndr)) + # Fill color based on Gender
# Create horizontal bar chart
geom_col() + # Draws the bars using the provided data
coord_flip() + # Flip coordinates to make bars horizontal
# Remove fill color legend
guides(fill = "none") + # Removes legend for the fill aesthetic
# Split the plot based on Political Orientation
facet_wrap(~ lrscale, nrow = 1) + # Separate plots for Left/Right
# Labels and titles for the plot
labs(x = "Percentage of Respondents", # X-axis label
y = NULL, # Remove Y-axis label
title = "Political Orientation by Gender", # Main title
subtitle = "Comparing the percentage distribution of Left vs. Right for Sweden (2002-2020)") + # Subtitle
# Adjust visual properties of the plot
theme(plot.title = element_text(size = 16, face = "bold"), # Format title
plot.subtitle = element_text(size = 12), # Format subtitle
axis.title.y = element_blank(), # Remove Y-axis title
legend.position = "bottom") # Position the legend at the bottom
# Display the ggplot object
lrscale_plot_v2
hungary_data <- read.fst("hungary_data.fst")
hungary_data <- hungary_data %>%
mutate(
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
TRUE ~ NA_character_ # Set anything that is not 1 or 2 to NA
),
lrscale = case_when(
lrscale %in% 1 ~ "Yes", # Left-wing (0 to 3)
lrscale %in% 2 ~ "No", # Right-wing (7 to 10)
TRUE ~ NA_character_ # Moderate (4, 5, 6) and special codes (77, 88, 99) set to NA
)
)
lrscale_percentages <- hungary_data %>% # Begin with the dataset 'hungary_data'
filter(!is.na(lrscale), !is.na(gndr)) %>% # Filter out rows where 'lrscale' or 'gender' is NA (missing data)
group_by(gndr, lrscale) %>% # Group the data by 'gender' and 'lrscale' categories
summarise(count = n(), .groups = 'drop') %>% # Summarise each group to get counts, and then drop groupings
mutate(percentage = count / sum(count) * 100) # Calculate percentage for each group by dividing count by total count and multiplying by 100
lrscale_percentages # The resulting dataframe
## # A tibble: 4 × 4
## gndr lrscale count percentage
## <chr> <chr> <int> <dbl>
## 1 Female No 409 39.9
## 2 Female Yes 185 18.1
## 3 Male No 275 26.9
## 4 Male Yes 155 15.1
# Recode clsprty and geo variables, removing NAs
hungary_data <- hungary_data %>%
mutate(
geo = recode(as.character(domicil),
'1' = "Urban",
'2' = "Urban",
'3' = "Rural",
'4' = "Rural",
'5' = "Rural",
'7' = NA_character_,
'8' = NA_character_,
'9' = NA_character_)
) %>%
filter(!is.na(lrscale), !is.na(geo)) # Removing rows with NA in clsprty or geo
# Calculate conditional probabilities, excluding NAs
cond <- hungary_data %>%
count(lrscale, geo) %>%
group_by(geo) %>%
mutate(prob = n / sum(n))
cond
## # A tibble: 4 × 4
## # Groups: geo [2]
## lrscale geo n prob
## <chr> <chr> <int> <dbl>
## 1 No Rural 469 0.668
## 2 No Urban 215 0.668
## 3 Yes Rural 233 0.332
## 4 Yes Urban 107 0.332
Provide code and answer: In Hungary, what is the conditional probability of NOT feeling close to any particular party given that the person lives in a rural area?
Answer: Given that a citizen in Hungary lives in a rural area, the conditional probability of NOT feeling close to any political party is 66.8%.