Homework 2

packages <- c("tidyverse", "fst", "modelsummary", "viridis") 

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"
rm(list=ls()); gc() 
##           used (Mb) gc trigger  (Mb) limit (Mb) max used (Mb)
## Ncells 1081133 57.8    2195805 117.3         NA  1520606 81.3
## Vcells 1870722 14.3    8388608  64.0      16384  2374949 18.2
ess <- read_fst("All-ESS-Data.fst")

Task 1: in the tutorial, we calculated the average trust in others for France and visualized it. Using instead the variable ‘Trust in Parliament’ (trstplt) and the country of Spain (country file provided on course website), visualize the average trust by survey year. You can truncate the y-axis if you wish. Provide appropriate titles and labels given the changes. What are your main takeaways based on the visual (e.g., signs of increase, decrease, or stall)?

spain_data <- read.fst("spain_data.fst")
spain_data <- spain_data %>%
  mutate(
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt), # set values 77, 88, and 99 to NA.
  )
table(spain_data$trstplt)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
## 5165 1830 2329 2441 2085 2890 1154  639  355   80   71
spain_data$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  spain_data$year[spain_data$essround == i] <- replacements[i]
}
table(spain_data$year)
## 
## 2002 2004 2006 2008 2010 2012 2014 2016 2018 2020 
## 1729 1663 1876 2576 1885 1889 1925 1958 1668 2283
trust_by_year <- spain_data %>%
  group_by(year) %>%
  summarize(mean_trust = mean(trstplt, na.rm = TRUE))
trust_by_year
## # A tibble: 10 × 2
##     year mean_trust
##    <dbl>      <dbl>
##  1  2002       3.41
##  2  2004       3.66
##  3  2006       3.49
##  4  2008       3.32
##  5  2010       2.72
##  6  2012       1.91
##  7  2014       2.23
##  8  2016       2.40
##  9  2018       2.55
## 10  2020       1.94
ggplot(trust_by_year, aes(x = year, y = mean_trust)) +
  geom_line(aes(group = 1), color = "blue", size = 1, linetype = "longdash") +  # Dashed line for the trend
#  geom_point(aes(color = mean_trust), size = 3) +  # if you want to try alternative + strive to keep improving
#  scale_color_viridis(option = "D", end = 0.9, direction = -1) +  # try the alternative, and try to keep improving
  labs(title = "Trust in Parliament in Spain (2002-2020)", 
       x = "Survey Year", 
       y = "Average Trustin Parliament (0-10 scale)") +
  ylim(1, 5) +  # change the numbers to truncate more or less or keep the full 0 to 10 scale
  theme_minimal() +  # Minimal theme for a clean look
  theme(legend.position = "none")  # Remove the legend
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

So therefore, average trust in parliment in spain was the highest in 2004 at 3.6. the level of trust gradually declined after, with a slight increase between the year 2013 to 2018.

Task 2: Based on the figure we produced above called task2_plot, tell us: what are your main takeaways regarding France relative to Italy and Norway? Make sure to be concrete and highlight at least two important comparative trends visualized in the graph.

france_data <- read.fst("france_data.fst")
norway_data <- read.fst("norway_data.fst")
italy_data <- read.fst("italy_data.fst")
# Combine data from three countries (France, Norway, Italy) into a single dataset
combined_countries_data <- bind_rows(
  france_data %>% mutate(cntry = "France"),  # Add a new column 'cntry' with the value 'France' to france_data
  norway_data %>% mutate(cntry = "Norway"), # Do the same for norway_data with 'Norway'
  italy_data %>% mutate(cntry = "Italy")    # And for italy_data with 'Italy'
) %>%
  filter(!yrbrn %in% c(7777, 8888)) %>%  # Remove rows where 'yrbrn' (year born) is either 7777 or 8888 (likely missing data codes)
  mutate(
    yrbrn = ifelse(yrbrn > 2005, NA, yrbrn), # Replace 'yrbrn' values greater than 2005 with NA (as they are invalid)
    clsprty = ifelse(clsprty %in% c(7, 8, 9), NA, clsprty), # Replace 'clsprty' values of 7, 8, 9 with NA (considered as invalid or missing)
    clsprty = ifelse(clsprty == 2, 0, clsprty) # Convert 'clsprty' value of 2 to 0 (recoding for analysis purposes)
  )

# Compute proportions for 'clsprty' by 'yrbrn' for each country
clsprty_proportions_compare <- combined_countries_data %>%
  filter(!is.na(yrbrn) & yrbrn >= 1920 & !is.na(clsprty)) %>% # Filter out rows with NA in 'yrbrn' and 'clsprty', and where 'yrbrn' is earlier than 1920
  group_by(yrbrn, cntry) %>% # Group the data by 'yrbrn' (year born) and 'cntry' (country)
  summarise(yes_count = sum(clsprty), total_count = n(), .groups = 'drop') %>% # Summarise data to get count of 'clsprty' and total count per group
  mutate(proportion = yes_count / total_count) # Calculate the proportion of 'clsprty' in each group
# Visualize the data
task2_plot <- ggplot(clsprty_proportions_compare, aes(x = yrbrn, y = proportion, color = cntry)) +  # Initialize ggplot with 'clsprty_proportions' data, setting 'yrbrn' as x-axis, 'proportion' as y-axis, and 'cntry' for color differentiation
  geom_point() +  # Add points to the plot for each data point
  geom_smooth(method = 'loess', se = FALSE) +  # Add a LOESS smoothed line for trend visualization without standard error bands
  labs(
    title = "Proportion Saying 'Yes' to Feeling Close to a Party by Cohort",  # Title of the plot
    x = "Cohort",  # Label for the x-axis
    y = "Proportion",  # Label for the y-axis
    color = "Country"  # Legend title for color coding by country
  ) +
  theme_minimal() +  # Use a minimal theme for a cleaner look
  scale_color_manual(values = c(
    "France" = "#1f77b4",  # Assign a specific color for France
    "Norway" = "#ff7f0e",  # Assign a different color for Norway
    "Italy" = "#2ca02c"    # And another distinct color for Italy
  )) +
  scale_y_continuous(limits = c(0, 1))  # Set the y-axis limits from 0 to 1 as we're dealing with proportions

# Display the plot
task2_plot
## `geom_smooth()` using formula = 'y ~ x'

Answer: from 1920 to 1950, france was the only country that displayed an increasing trend towards citizens saying yes to feeling closeness to a party, while the same population, in italy and norway decressed. however in italy and norway, the decrease was more gradual and occured after a slight increase in “yes” respondents between the year 1945 - 1950. One of the main things i noticed, looking at the graph, is that the french respondents who voted for “yes” have decreased severly and rapidly since 1950s, In comparison to italy and norway.

Task 3: What is the marginal percentage of italian men who feel closer to a particular political party ?

italy_data <- read.fst("italy_data.fst")
italy_data <- italy_data %>%
  # Modify 'clsprty' column: set values of 2 to 0, and values in 7, 8, 9 to NA. Retain other values as is.
  mutate(
    clsprty = ifelse(clsprty == 2, 1, ifelse(clsprty %in% c(7, 8, 9), NA, clsprty))
  ) %>%
  # Modify 'yrbrn' column: set specific values (7777, 8888, 9999) to NA.
  mutate(
    yrbrn = ifelse(yrbrn %in% c(7777, 8888, 9999), NA, yrbrn)
  )
italy_data <- italy_data %>%
  mutate(
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      TRUE ~ NA_character_  # Set anything that is not 1 or 2 to NA
    ),
    lrscale = case_when(
      lrscale %in% 1 ~ "Yes",     
      lrscale %in% 2 ~ "No",     
      TRUE ~ NA_character_  # Moderate (4, 5, 6) and special codes (77, 88, 99) set to NA 
    )    
  ) 
lrscale_percentages <- italy_data %>%  # Begin with the dataset 'france_data'
  filter(!is.na(lrscale), !is.na(gndr)) %>%  # Filter out rows where 'lrscale' or 'gender' is NA (missing data)
  group_by(gndr, lrscale) %>%  # Group the data by 'gender' and 'lrscale' categories
  summarise(count = n(), .groups = 'drop') %>%  # Summarise each group to get counts, and then drop groupings
  mutate(percentage = count / sum(count) * 100)  # Calculate percentage for each group by dividing count by total count and multiplying by 100

lrscale_percentages  # The resulting dataframe
## # A tibble: 4 × 4
##   gndr   lrscale count percentage
##   <chr>  <chr>   <int>      <dbl>
## 1 Female No        225       33.7
## 2 Female Yes       107       16.0
## 3 Male   No        248       37.1
## 4 Male   Yes        88       13.2

Answer: of all italian respondents, 13.2% are males who feel close to a particular poltical party.

Task 4: In the tutorial, we calculated then visualized the percentage distribution for left vs. right by gender for France. Your task is to replicate the second version of the visualization but for the country of Sweden instead.

sweden_data <- read.fst("sweden_data.fst")
sweden_data <- sweden_data %>%
  mutate(
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      TRUE ~ NA_character_  # Set anything that is not 1 or 2 to NA
    ),
    lrscale = case_when(
      lrscale %in% 0:3 ~ "Left Wing",       # Left-wing (0 to 3)
      lrscale %in% 7:10 ~ "Right Wing",     # Right-wing (7 to 10)
      TRUE ~ NA_character_  # Moderate (4, 5, 6) and special codes (77, 88, 99) set to NA 
    )    
  )
lrscale_percentages <- sweden_data %>%  # Begin with the dataset 'france_data'
  filter(!is.na(lrscale), !is.na(gndr)) %>%  # Filter out rows where 'lrscale' or 'gender' is NA (missing data)
  group_by(gndr, lrscale) %>%  # Group the data by 'gender' and 'lrscale' categories
  summarise(count = n(), .groups = 'drop') %>%  # Summarise each group to get counts, and then drop groupings
  mutate(percentage = count / sum(count) * 100)  # Calculate percentage for each group by dividing count by total count and multiplying by 100

lrscale_percentages  # The resulting dataframe
## # A tibble: 4 × 4
##   gndr   lrscale    count percentage
##   <chr>  <chr>      <int>      <dbl>
## 1 Female Left Wing   2296       23.0
## 2 Female Right Wing  2530       25.3
## 3 Male   Left Wing   2062       20.6
## 4 Male   Right Wing  3107       31.1
# Create a ggplot object for horizontal bar chart with the specified style
lrscale_plot_v2 <- ggplot(lrscale_percentages, 
            aes(x = percentage,  # Use percentage directly
                y = reorder(gndr, -percentage),  # Order bars within each gender
                fill = gndr)) +  # Fill color based on Gender

  # Create horizontal bar chart
  geom_col() +  # Draws the bars using the provided data
  coord_flip() +  # Flip coordinates to make bars horizontal

  # Remove fill color legend
  guides(fill = "none") +  # Removes legend for the fill aesthetic

  # Split the plot based on Political Orientation
  facet_wrap(~ lrscale, nrow = 1) +  # Separate plots for Left/Right

  # Labels and titles for the plot
  labs(x = "Percentage of Respondents",  # X-axis label
       y = NULL,  # Remove Y-axis label
       title = "Political Orientation by Gender",  # Main title
       subtitle = "Comparing the percentage distribution of Left vs. Right for Sweden (2002-2020)") +  # Subtitle

  # Adjust visual properties of the plot
  theme(plot.title = element_text(size = 16, face = "bold"),  # Format title
        plot.subtitle = element_text(size = 12),  # Format subtitle
        axis.title.y = element_blank(),  # Remove Y-axis title
        legend.position = "bottom")  # Position the legend at the bottom

# Display the ggplot object
lrscale_plot_v2

Task 5: In Hungary, what is the conditional probability of NOT feeling close to any particular party given that the person lives in a rural area?

hungary_data <- read.fst("hungary_data.fst")
hungary_data <- hungary_data %>%
  mutate(
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      TRUE ~ NA_character_  # Set anything that is not 1 or 2 to NA
    ),
    lrscale = case_when(
      lrscale %in% 1 ~ "Yes",       # Left-wing (0 to 3)
      lrscale %in% 2 ~ "No",     # Right-wing (7 to 10)
      TRUE ~ NA_character_  # Moderate (4, 5, 6) and special codes (77, 88, 99) set to NA 
    )    
  ) 
lrscale_percentages <- hungary_data %>%  # Begin with the dataset 'france_data'
  filter(!is.na(lrscale), !is.na(gndr)) %>%  # Filter out rows where 'lrscale' or 'gender' is NA (missing data)
  group_by(gndr, lrscale) %>%  # Group the data by 'gender' and 'lrscale' categories
  summarise(count = n(), .groups = 'drop') %>%  # Summarise each group to get counts, and then drop groupings
  mutate(percentage = count / sum(count) * 100)  # Calculate percentage for each group by dividing count by total count and multiplying by 100

lrscale_percentages  # The resulting dataframe
## # A tibble: 4 × 4
##   gndr   lrscale count percentage
##   <chr>  <chr>   <int>      <dbl>
## 1 Female No        409       39.9
## 2 Female Yes       185       18.1
## 3 Male   No        275       26.9
## 4 Male   Yes       155       15.1
# Recode clsprty and geo variables, removing NAs
hungary_data <- hungary_data %>%
  mutate(
    geo = recode(as.character(domicil), 
                 '1' = "Urban", 
                 '2' = "Urban",
                 '3' = "Rural", 
                 '4' = "Rural", 
                 '5' = "Rural",
                 '7' = NA_character_,
                 '8' = NA_character_,
                 '9' = NA_character_)
  ) %>%
  filter(!is.na(lrscale), !is.na(geo))  # Removing rows with NA in clsprty or geo
# Calculate conditional probabilities, excluding NAs
cond <- hungary_data %>%
  count(lrscale, geo) %>%
  group_by(geo) %>%
  mutate(prob = n / sum(n))

cond
## # A tibble: 4 × 4
## # Groups:   geo [2]
##   lrscale geo       n  prob
##   <chr>   <chr> <int> <dbl>
## 1 No      Rural   469 0.668
## 2 No      Urban   215 0.668
## 3 Yes     Rural   233 0.332
## 4 Yes     Urban   107 0.332

Answer: the conditional probability of not feeling close to any political party is 66.8%, given that hungarian citezens live in a rural area.