rm(list=ls()); gc()
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 528717 28.3    1175897 62.8         NA   669445 35.8
## Vcells 974238  7.5    8388608 64.0      16384  1851710 14.2
# List of packages
packages <- c("tidyverse", "fst", "modelsummary", "viridis") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"

Task 1

Provide code and answer.

Prompt: in the tutorial, we calculated the average trust in others for France and visualized it. Using instead the variable ‘Trust in Parliament’ (trstplt) and the country of Spain (country file provided on course website), visualize the average trust by survey year. You can truncate the y-axis if you wish. Provide appropriate titles and labels given the changes. What are your main takeaways based on the visual (e.g., signs of increase, decrease, or stall)?

spain_data <- read.fst("spain_data.fst")
spain_data <- spain_data %>%
  mutate(
    trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt), # set values 77, 88, and 99 to NA.
  )
table(spain_data$trstplt) # if there are values that are not supposed to be there (e.g., 77, 88, 99 in this case), then we need to deal with it
## 
##    0    1    2    3    4    5    6    7    8    9   10 
## 5165 1830 2329 2441 2085 2890 1154  639  355   80   71
spain_data$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  spain_data$year[spain_data$essround == i] <- replacements[i]
}
table(spain_data$year)
## 
## 2002 2004 2006 2008 2010 2012 2014 2016 2018 2020 
## 1729 1663 1876 2576 1885 1889 1925 1958 1668 2283
trust_by_year <- spain_data %>%
  group_by(year) %>%
  summarize(mean_trust = mean(trstplt, na.rm = TRUE))
trust_by_year
## # A tibble: 10 × 2
##     year mean_trust
##    <dbl>      <dbl>
##  1  2002       3.41
##  2  2004       3.66
##  3  2006       3.49
##  4  2008       3.32
##  5  2010       2.72
##  6  2012       1.91
##  7  2014       2.23
##  8  2016       2.40
##  9  2018       2.55
## 10  2020       1.94
ggplot(trust_by_year, aes(x = year, y = mean_trust)) +
  geom_line(color = "blue", size = 1) +  # Line to show the trend
  geom_point(color = "red", size = 3) +  # Points to highlight each year's value
  labs(title = "Trust in Parliament in Spain (2002-2020)", 
       x = "Survey Year", 
       y = "Average Trust (0-10 scale)") +
  ylim(0, 10) +  # Setting the y-axis limits from 0 to 10
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

According to the visual, there were signs of decrease. From 2005 to around 2012, the average trust kept decreasing from 3.75 to around 1.9. ALthough it had slightly increased around 0.6 during around 2012 to around 2018, it still decreased after that.

Task 2

Provide answer only.

Prompt and question: Based on the figure we produced above called task2_plot, tell us: what are your main takeaways regarding France relative to Italy and Norway? Make sure to be concrete and highlight at least two important comparative trends visualized in the graph.

Comparing France and Italy, although both of them are continuously decreasing since 1950, the proportion of France is still larger than Italy. France has decreased from almost 0.625 to a little lower than 0.25. Italy has decreased from around 0.4375 to almost 0.125. Since 0.125 is smaller than 0.25, France still has a larger proportion. Comparing to France and Norway, Norway is better than France. Norway has a steadier decrease while France has a significant decrease. Although 1920 to 1940, France had increased for a little when Norway had decreased. Starting from 1940, France has decreased from 0.625 to a little lower than 0.25 and Norway has only decreased from around 0.75 to around 0.5. It shows that France has decreased for around 0.4 proportion and Norway has only decreased for around 0.25. Therefore, Norway is better than France and France is better than Italy.

Task 3

Provide code and answer.

Question: What is the marginal percentage of Italian men who feel close to a particular political party?

italy_data <- read.fst("italy_data.fst")
italy_data <- italy_data %>%
  mutate(
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      TRUE ~ NA_character_  # Set anything that is not 1 or 2 to NA
    ),
    lrscale = case_when(
      lrscale %in% 0:3 ~ "Left",       # Left-wing (0 to 3)
      lrscale %in% 7:10 ~ "Right",     # Right-wing (7 to 10)
      TRUE ~ NA_character_  # Moderate (4, 5, 6) and special codes (77, 88, 99) set to NA 
    )    
  ) 
lrscale_percentages <- italy_data %>%  # Begin with the dataset 'italy_data'
  filter(!is.na(lrscale), !is.na(gndr)) %>%  # Filter out rows where 'lrscale' or 'gender' is NA (missing data)
  group_by(gndr, lrscale) %>%  # Group the data by 'gender' and 'lrscale' categories
  summarise(count = n(), .groups = 'drop') %>%  # Summarise each group to get counts, and then drop groupings
  mutate(percentage = count / sum(count) * 100)  # Calculate percentage for each group by dividing count by total count and multiplying by 100

lrscale_percentages  # The resulting dataframe
## # A tibble: 4 × 4
##   gndr   lrscale count percentage
##   <chr>  <chr>   <int>      <dbl>
## 1 Female Left      930       23.9
## 2 Female Right     955       24.5
## 3 Male   Left      924       23.7
## 4 Male   Right    1084       27.8

The marginal percentage for male respondents who lean politically right is 27.8% while the marginal percentage for male respondents who lean politically left is 23.7%.

Task 4

Provide code and output only.

Prompt: In the tutorial, we calculated then visualized the percentage distribution for left vs. right by gender for France. Your task is to replicate the second version of the visualization but for the country of Sweden instead.

sweden_data <- read.fst("sweden_data.fst")
lrscale_plot <- ggplot(lrscale_percentages, aes(x = lrscale, y = percentage, fill = lrscale)) +
  geom_bar(stat = "identity", position = position_dodge()) +  # Dodged bar chart
  facet_wrap(~ gndr, scales = "fixed") +  # Fixed scales for y-axis across facets
  scale_fill_brewer(palette = "Set1") +  # Distinct colors for Left and Right
  labs(
    title = "Political Orientation (Left vs. Right) by Gender in Sweden",
    x = "Political Orientation",
    y = "Percentage of Respondents",
    fill = "Orientation"
  ) +
  theme_minimal() +  # Minimal theme for clarity
  theme(legend.position = "bottom")  # Legend at the bottom

# Display the ggplot object
lrscale_plot

Task 5

Provide code and answer: In Hungary, what is the conditional probability of NOT feeling close to any particular party given that the person lives in a rural area?

hungary_data <- read.fst("hungary_data.fst")
# Recode clsprty and geo variables, removing NAs
hungary_data <- hungary_data %>%
   mutate(
    clsprty = case_when(
      clsprty == 1 ~ "Yes",
      clsprty == 2 ~ "No",
      TRUE ~ NA_character_  # Set anything that is not 1 or 2 to NA
    ),
    geo = recode(as.character(domicil), 
                 '1' = "Urban", 
                 '2' = "Urban",
                 '3' = "Rural", 
                 '4' = "Rural", 
                 '5' = "Rural",
                 '7' = NA_character_,
                 '8' = NA_character_,
                 '9' = NA_character_)
  ) %>%
  filter(!is.na(clsprty), !is.na(geo))  # Removing rows with NA in clsprty or geo
# Calculate conditional probabilities, excluding NAs
cond <- hungary_data %>%
  count(clsprty, geo) %>%
  group_by(geo) %>%
  mutate(prob = n / sum(n))

cond
## # A tibble: 4 × 4
## # Groups:   geo [2]
##   clsprty geo       n  prob
##   <chr>   <chr> <int> <dbl>
## 1 No      Rural  6275 0.554
## 2 No      Urban  2395 0.512
## 3 Yes     Rural  5055 0.446
## 4 Yes     Urban  2283 0.488

The conditional probability of NOT feeling close to any particular party given that the person lives in a rural area is 55.4%.