Set-Up

# Installing & Applying Packages
# Work out which of the required packages are not yet installed, install only
# those, then attach every required package by name.
packages <- c("tidyverse", "fst", "modelsummary", "viridis")
new_packages <- setdiff(packages, installed.packages()[, "Package"])
if (length(new_packages) > 0) {
  install.packages(new_packages)
}
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"
# Loading Data into R
# The four country files are read relative to the current working directory.
# The script deliberately does not call setwd() with a machine-specific path or
# rm(list = ls()): both make the analysis non-portable and silently wipe the
# workspace of whoever runs it. Run/knit from the folder that holds the .fst
# files instead.
hungary_data <- read_fst("hungary_data.fst")
spain_data <- read_fst("spain_data.fst")
sweden_data <- read_fst("sweden_data.fst")
italy_data <- read_fst("italy_data.fst")

Task 1: In the tutorial, we calculated the average trust in others for France and visualized it. Using instead the variable ‘Trust in Parliament’ (trstplt) and the country of Spain, visualize the average trust by survey year. You can truncate the y-axis if you wish. Provide appropriate titles and labels given the changes. What are your main takeaways based on the visual (e.g., signs of increase, decrease, or stall)?

# Viewing & Cleaning Spain trstplt data
# Codes 77, 88 and 99 are set to NA; every other value is kept unchanged.
spain_data$trstplt <- ifelse(
  spain_data$trstplt %in% c(77, 88, 99),
  NA,
  spain_data$trstplt
)
table(spain_data$trstplt)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
## 5165 1830 2329 2441 2085 2890 1154  639  355   80   71
# Creating a year variable
# `replacements[k]` is the year assigned to essround == k. The lookup is
# vectorised with match(), so the number of rounds is taken from the table
# itself rather than a separately hard-coded loop bound; any essround value
# that is NA or not in 1..length(replacements) yields NA, as before.
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
spain_data$year <- replacements[
  match(spain_data$essround, seq_along(replacements))
]
table(spain_data$year)
## 
## 2002 2004 2006 2008 2010 2012 2014 2016 2018 2020 
## 1729 1663 1876 2576 1885 1889 1925 1958 1668 2283
# Calculating average trust by year
# One row per survey year; missing trstplt values are ignored in the mean.
trust_by_year <- summarise(
  group_by(spain_data, year),
  mean_trust = mean(trstplt, na.rm = TRUE)
)
trust_by_year
## # A tibble: 10 × 2
##     year mean_trust
##    <dbl>      <dbl>
##  1  2002       3.41
##  2  2004       3.66
##  3  2006       3.49
##  4  2008       3.32
##  5  2010       2.72
##  6  2012       1.91
##  7  2014       2.23
##  8  2016       2.40
##  9  2018       2.55
## 10  2020       1.94
# Visualizing trust by year
# Line + point chart of the yearly means on the full 0-10 response scale.
# Line thickness is set with `linewidth`: using `size` for lines has been
# deprecated since ggplot2 3.4.0 and triggers a lifecycle warning.
ggplot(trust_by_year, aes(x = year, y = mean_trust)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 3) +
  labs(
    title = "Trust in Parliament in Spain (2002-2020)",
    x = "Survey Year",
    y = "Average Trust (0-10 scale)"
  ) +
  ylim(0, 10) +
  theme_minimal()

For this variable, it is important to note that a survey response of 0 means ‘No trust at all’ and a response of 10 means ‘Complete trust’. The graph shows that average trust in parliament in Spain rose slightly between 2002 and 2004 (from 3.41 to 3.66) and then declined steadily to a low of 1.91 in 2012. After 2012, trust recovered gradually through 2018 (2.55), before falling again in 2020 to 1.94, a level similar to the 2012 low.

Task 3: What is the marginal percentage of Italian men who feel close to a particular political party?

# Viewing and cleaning up data
# clsprty: 2 becomes 0, codes 7/8/9 become NA, anything else is kept as is.
# gndr: code 9 becomes NA.
italy_data <- italy_data %>%
  mutate(
    clsprty = case_when(
      clsprty == 2 ~ 0,
      clsprty %in% c(7, 8, 9) ~ NA_real_,
      TRUE ~ clsprty
    ),
    gndr = replace(gndr, gndr %in% 9, NA)
  )
table(italy_data$clsprty)
## 
##    0    1 
## 5828 3626
table(italy_data$gndr)
## 
##    1    2 
## 4836 5329
# Adding names to gndr and clsprty
# Each code is translated through a positional lookup: match() finds the code's
# position in the code vector and that position indexes the label vector.
# Codes that are not listed (and NA) become NA.
italy_data <- italy_data %>%
  mutate(
    gndr = c("Male", "Female")[match(gndr, c(1, 2))],
    clsprty = c("No", "Yes")[match(clsprty, c(0, 1))]
  )

# Checking it worked
table(italy_data$clsprty)
## 
##   No  Yes 
## 5828 3626
table(italy_data$gndr)
## 
## Female   Male 
##   5329   4836
# Calculating the marginal percentage
# Rows missing either variable are dropped, the gender x closeness cells are
# counted, and each cell is expressed as a percentage of ALL remaining
# respondents (the grouping is removed before dividing, so the four
# percentages sum to 100).
clsprty_percentages <- italy_data %>%
  filter(!(is.na(clsprty) | is.na(gndr))) %>%
  group_by(gndr, clsprty) %>%
  tally(name = "count") %>%
  ungroup() %>%
  mutate(percentage = count / sum(count) * 100)

clsprty_percentages
## # A tibble: 4 × 4
##   gndr   clsprty count percentage
##   <chr>  <chr>   <int>      <dbl>
## 1 Female No       3228       34.2
## 2 Female Yes      1686       17.9
## 3 Male   No       2593       27.5
## 4 Male   Yes      1936       20.5

The response “yes” for the variable clsprty indicates that the respondent feels closer to a particular party than to all other parties. Therefore, the marginal percentage of Italian men who feel close to a particular political party is 20.5%, expressed as a share of all respondents with valid answers on both gender and party closeness.

Task 4: In the tutorial, we calculated then visualized the percentage distribution for left vs. right by gender for France. Your task is to replicate the second version of the visualization but for the country of Sweden instead.

# Cleaning & naming data
# lrscale is translated through an 11-entry label table aligned with the codes
# 0..10: positions for 0-3 are "Left", 4-6 are NA, 7-10 are "Right". Any value
# not in 0..10 (including 77, 88, 99 and NA) has no match and becomes NA.
sweden_data <- sweden_data %>%
  mutate(
    lrscale = rep(c("Left", NA, "Right"), times = c(4, 3, 4))[
      match(lrscale, 0:10)
    ]
  )
table(sweden_data$lrscale)
## 
##  Left Right 
##  4362  5641
## Note: Responses of 4,5,6 to lrscale are removed as they represent 'moderate' or 'centrist' self-placement on the scale.

# gndr: 1 -> "Male", 2 -> "Female"; every other code (including 9) becomes NA.
sweden_data <- sweden_data %>%
  mutate(gndr = c("Male", "Female")[match(gndr, c(1, 2))])
table(sweden_data$gndr)
## 
## Female   Male 
##   9127   9076
# Calculations
# Count each gender x left/right cell among rows with both variables present,
# then express every cell as a percentage of all of those rows combined.
lrscale_percentages <- sweden_data %>%
  filter(!(is.na(lrscale) | is.na(gndr))) %>%
  group_by(gndr, lrscale) %>%
  tally(name = "count") %>%
  ungroup() %>%
  mutate(percentage = count / sum(count) * 100)
lrscale_percentages
## # A tibble: 4 × 4
##   gndr   lrscale count percentage
##   <chr>  <chr>   <int>      <dbl>
## 1 Female Left     2296       23.0
## 2 Female Right    2530       25.3
## 3 Male   Left     2062       20.6
## 4 Male   Right    3107       31.1
# Visualization
# Bar chart of the cell percentages by gender, one facet per left/right
# category, with the fill legend suppressed.
# NOTE(review): with coord_flip(), theme axis.* settings presumably apply to the
# axes as drawn, so axis.title.y may be blanking the percentage axis title -
# confirm on the rendered figure.
lrscale_plot <- ggplot(
  data = lrscale_percentages,
  mapping = aes(x = percentage, y = reorder(gndr, -percentage), fill = gndr)
) +
  geom_col() +
  coord_flip() +
  guides(fill = "none") +
  facet_wrap(vars(lrscale), nrow = 1) +
  labs(
    title = "Political Orientation by Gender",
    subtitle = "Comparing the percentage distribution of left vs. right for Sweden (2002-2020) ",
    x = "Percentage of Respondents",
    y = NULL
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12),
    axis.title.y = element_blank(),
    legend.position = "bottom"
  )
lrscale_plot

Task 5: In Hungary, what is the conditional probability of NOT feeling close to any particular party given that the person lives in a rural area?

# Cleaning up data
# clsprty: 1 -> "Yes", 2 -> "No"; every other code (including 7, 8, 9) and NA
# has no match in the lookup and becomes NA.
hungary_data <- hungary_data %>%
  mutate(clsprty = c("Yes", "No")[match(clsprty, c(1, 2))])
table(hungary_data$clsprty)
## 
##   No  Yes 
## 8679 7342
# Cleaning and recoding domicil as a new variable "geo" with named responses
# Codes 1-2 are labelled "Urban", 3-5 "Rural", and 7-9 become NA; any other
# value is carried over as its character form. Rows missing either clsprty or
# geo are then removed from hungary_data.
hungary_data <- hungary_data %>%
  mutate(
    geo = case_when(
      domicil %in% c(1, 2) ~ "Urban",
      domicil %in% c(3, 4, 5) ~ "Rural",
      domicil %in% c(7, 8, 9) ~ NA_character_,
      TRUE ~ as.character(domicil)
    )
  ) %>%
  filter(!(is.na(clsprty) | is.na(geo)))


# Calculate conditional probability
# Count each clsprty x geo cell, then divide by the total within the same geo
# category, so `prob` is P(clsprty | geo) and sums to 1 inside each geo value.
# The grouping is dropped afterwards so `cond` is a plain tibble and later
# operations on it are not silently performed per group.
cond <- hungary_data %>%
  count(clsprty, geo) %>%
  group_by(geo) %>%
  mutate(prob = n / sum(n)) %>%
  ungroup()
cond
## # A tibble: 4 × 4
##   clsprty geo       n  prob
##   <chr>   <chr> <int> <dbl>
## 1 No      Rural  6275 0.554
## 2 No      Urban  2395 0.512
## 3 Yes     Rural  5055 0.446
## 4 Yes     Urban  2283 0.488

Therefore, in Hungary, given that someone resides in a rural area, the probability of that individual not feeling close to any particular party is 55.4%.