Naccarato_Isabella_Homework

# Packages this script depends on (extend the vector as needed)
packages <- c(
  "tidyverse",
  "fst",
  "modelsummary",
  "viridis"
)

# Install only the packages that are not already present in the library
new_packages <- setdiff(packages, installed.packages()[, "Package"])
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package; the list returned by lapply() is auto-printed
lapply(packages, function(pkg) library(pkg, character.only = TRUE))

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "viridis"      "viridisLite"  "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"

## HOMEWORK QUESTIONS
# Question 1: Provide code and answer.
# Prompt: In the tutorial, we calculated the average trust in others for France and visualized it. Using instead the variable 'Trust in Parliament' (trstplt) and the country of Spain (country file provided on course website), visualize the average trust by survey year. You can truncate the y-axis if you wish. Provide appropriate titles and labels given the changes. What are your main takeaways based on the visual (e.g., signs of increase, decrease, or stall)?
## Answer

# Plot layout used below: x-axis = survey year, y-axis = average trust

# Read the Spain country file
spain_data <- read.fst("spain_data.fst")

# Tabulate trstplt to spot any values that should be removed
table(spain_data[["trstplt"]])

## 
##    0    1    2    3    4    5    6    7    8    9   10   77   88   99 
## 5165 1830 2329 2441 2085 2890 1154  639  355   80   71   46  336   31

# trstplt should only run from 0 to 10, so blank out the codes 77, 88 and 99
spain_data <- mutate(
  spain_data,
  trstplt = replace(trstplt, trstplt %in% c(77, 88, 99), NA)
)

# Tabulate again to confirm those codes are gone
table(spain_data[["trstplt"]])

## 
##    0    1    2    3    4    5    6    7    8    9   10 
## 5165 1830 2329 2441 2085 2890 1154  639  355   80   71

# Create a survey-year variable so average trust can be computed per year.
# `replacements` holds the year for essround 1, 2, ..., in that order. A
# vectorised lookup replaces the per-round loop, so the number of rounds is
# taken from the lookup vector itself instead of a hard-coded 1:10. Rows whose
# essround has no entry in the lookup (or is missing) get NA.
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
round_index <- match(spain_data$essround, seq_along(replacements))
spain_data$year <- replacements[round_index]

table(spain_data$year) # check the recode: one count per survey year

## 
## 2002 2004 2006 2008 2010 2012 2014 2016 2018 2020 
## 1729 1663 1876 2576 1885 1889 1925 1958 1668 2283

# Average trust per survey year; missing trust values are ignored in the mean
trust_by_year <- summarize(
  group_by(spain_data, year),
  mean_trust = mean(trstplt, na.rm = TRUE)
)
trust_by_year

## # A tibble: 10 × 2
##     year mean_trust
##    <dbl>      <dbl>
##  1  2002       3.41
##  2  2004       3.66
##  3  2006       3.49
##  4  2008       3.32
##  5  2010       2.72
##  6  2012       1.91
##  7  2014       2.23
##  8  2016       2.40
##  9  2018       2.55
## 10  2020       1.94

# Visualize the average by year on the full 0-10 scale
ggplot(trust_by_year, aes(x = year, y = mean_trust)) +
  # Line to show the trend. Line thickness is set with `linewidth`; using
  # `size` for lines is deprecated since ggplot2 3.4.0 and raises a warning.
  geom_line(color = "blue", linewidth = 1) +
  # Points to highlight each year's value (`size` is still correct for points)
  geom_point(color = "red", size = 3) +
  labs(
    title = "Trust in Parliament in Spain (2002-2020)",
    x = "Survey Year",
    y = "Average Trust (0-10 scale)"
  ) +
  ylim(0, 10) + # y-axis limits from 0 to 10
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Same trend with a truncated y-axis and without the points, so the movement
# of the line is easier to see
ggplot(trust_by_year, aes(x = year, y = mean_trust)) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0 and raises a warning.
  geom_line(
    aes(group = 1),
    color = "blue",
    linewidth = 1,
    linetype = "longdash"
  ) +
  labs(
    title = "Trust in Parliament in Spain (2002-2020)",
    x = "Survey Year",
    y = "Average Trust (0-10 scale)"
  ) +
  # Change the limits to truncate more or less, or restore the full 0-10 scale
  ylim(1.25, 4) +
  theme_minimal() + # minimal theme for a clean look
  theme(legend.position = "none") # no legend

## Based on the visual I get from this code, I can see that over the survey years 2004 to 2012 there was a decline in average trust in parliament. Between 2012 and 2018 there was a brief rise in average trust in parliament, followed by another decline after 2018.
# Question 2: Provide answer only.

Prompt and question: Based on the figure we produced above called task2_plot, tell us: what are your main takeaways regarding France relative to Italy and Norway? Make sure to be concrete and highlight at least two important comparative trends visualized in the graph.

## Answer

Looking at task2_plot, the trends for France and Italy both decline from about the year 1950. Prior to 1950, the trend for France rises for about 20 years before beginning to decline, while the trend for Italy stalls for about 30 years before beginning to decline. After around 1985, the trend for Italy decreases at a steeper rate than it does for France.

Comparing France and Norway, the trends are also not consistent between 1920 and 1950. In France, as mentioned, the trend rises from 1920 to 1940, but in Norway the trend falls from 1920 to 1940, with only a very slight rise starting after 1940 for a short period of time. After 1950, the trends for both France and Norway decline, but the trend for France declines at a steeper rate, which is shown by the fact that the two trend lines diverge as we move towards the right of the graph. Trust in Norway declines much less and stays within a narrow range throughout the years.

#Question 3 Provide code and answer.

Question: What is the marginal percentage of Italian men who feel close to a particular political party? ###Answer

# Read the Italy country file
italy_data <- read.fst("italy_data.fst")

# Inspect the raw clsprty codes before recoding
# (target coding: 0 = no, 1 = yes, feels close to a particular party)
table(italy_data[["clsprty"]])

## 
##    1    2    7    8    9 
## 3626 5828  322  400    2

# Recode clsprty: 2 becomes 0, the codes 7/8/9 become NA, and every other
# value is kept as is
italy_data <- italy_data %>%
  mutate(
    clsprty = case_when(
      clsprty == 2 ~ 0,
      clsprty %in% c(7, 8, 9) ~ NA_real_,
      TRUE ~ clsprty
    )
  )

# Confirm that only 0 and 1 remain
table(italy_data[["clsprty"]])

## 
##    0    1 
## 5828 3626

# Turn the numeric codes into text labels
italy_data <- italy_data %>%
  mutate(
    # 1 -> "Male", 2 -> "Female"; anything else becomes NA
    gndr = case_match(
      gndr,
      1 ~ "Male",
      2 ~ "Female",
      .default = NA_character_
    ),
    # 0 -> "No" (not close to a party), 1 -> "Yes" (close to a party);
    # anything else, including NA, becomes NA
    clsprty = case_match(
      clsprty,
      0 ~ "No",
      1 ~ "Yes",
      .default = NA_character_
    )
  )

# Percentage of respondents in each gender x clsprty cell, out of all
# respondents that have both variables observed
clsprty_percentages <- italy_data %>%
  drop_na(clsprty, gndr) %>% # drop rows where clsprty or gndr is NA
  group_by(gndr, clsprty) %>%
  summarise(count = n(), .groups = "drop") %>% # cell counts, ungrouped
  mutate(percentage = count / sum(count) * 100) # share of the grand total

# Show the resulting table
clsprty_percentages

## # A tibble: 4 × 4
##   gndr   clsprty count percentage
##   <chr>  <chr>   <int>      <dbl>
## 1 Female No       3228       34.2
## 2 Female Yes      1686       17.9
## 3 Male   No       2593       27.5
## 4 Male   Yes      1936       20.5

The marginal percentage of Italian men who feel close to a particular political party is 20.50%.

##Question 4 Provide code and output only.

Prompt: In the tutorial, we calculated then visualized the percentage distribution for left vs. right by gender for France. Your task is to replicate the second version of the visualization but for the country of Sweden instead.

# Read the Sweden country file
sweden_data <- read.fst("sweden_data.fst")

# Step 1: prepare the variables for the left vs. right by gender distribution
sweden_data <- sweden_data %>%
  mutate(
    # 1 -> "Male", 2 -> "Female"; anything else becomes NA
    gndr = case_match(
      gndr,
      1 ~ "Male",
      2 ~ "Female",
      .default = NA_character_
    ),
    # 0-3 -> "Left", 7-10 -> "Right"; moderates (4, 5, 6) and the special
    # codes (77, 88, 99) become NA
    lrscale = case_match(
      lrscale,
      0:3 ~ "Left",
      7:10 ~ "Right",
      .default = NA_character_
    )
  )

# Percentage of respondents in each gender x lrscale cell, out of all Swedish
# respondents that have both variables observed
lrscale_percentages <- sweden_data %>%
  drop_na(lrscale, gndr) %>% # drop rows where lrscale or gndr is NA
  group_by(gndr, lrscale) %>%
  summarise(count = n(), .groups = "drop") %>% # cell counts, ungrouped
  mutate(percentage = count / sum(count) * 100) # share of the grand total

# Show the resulting table
lrscale_percentages

## # A tibble: 4 × 4
##   gndr   lrscale count percentage
##   <chr>  <chr>   <int>      <dbl>
## 1 Female Left     2296       23.0
## 2 Female Right    2530       25.3
## 3 Male   Left     2062       20.6
## 4 Male   Right    3107       31.1

# First tutorial-style chart: Left vs. Right bars, one panel per gender
lrscale_plot <- ggplot(
  lrscale_percentages,
  aes(x = lrscale, y = percentage, fill = lrscale)
) +
  # Bar heights come straight from the precomputed percentage column
  geom_col(position = position_dodge()) +
  # Same y-axis scale in both gender panels
  facet_wrap(~ gndr, scales = "fixed") +
  # Distinct colors for Left and Right
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Political Orientation (Left vs. Right) by Gender in Sweden",
    x = "Political Orientation",
    y = "Percentage of Respondents",
    fill = "Orientation"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

# Render the plot
lrscale_plot

# Second version of the visualization from the tutorial, built from
# lrscale_percentages: one bar per gender, one facet per political orientation
lrscale_plot_v2 <- ggplot(lrscale_percentages, 
            aes(x = percentage,  # Percentage is mapped to the x aesthetic
                y = reorder(gndr, -percentage),  # Gender on the y aesthetic, levels reordered by -percentage
                fill = gndr)) +  # Fill color based on gender

  # Draw the bars, then swap the axes
  # NOTE(review): with a continuous x and a discrete y, geom_col() presumably
  # already draws horizontal bars, so coord_flip() may turn them vertical --
  # confirm against the rendered plot and the tutorial figure.
  geom_col() +  # Draws the bars from the precomputed percentages
  coord_flip() +  # Swaps the x and y axes

  # Remove fill color legend
  guides(fill = "none") +  # Removes legend for the fill aesthetic

  # Split the plot based on political orientation
  facet_wrap(~ lrscale, nrow = 1) +  # One panel each for Left and Right, in a single row

  # Labels and titles for the plot
  labs(x = "Percentage of Respondents",  # Label for the x aesthetic (percentage)
       y = NULL,  # No label for the y aesthetic (gender)
       title = "Political Orientation by Gender",  # Main title
       subtitle = "Comparing the percentage distribution of left vs. right for Sweden (2002-2020)") +  # Subtitle

  # Adjust visual properties of the plot
  # NOTE(review): after coord_flip(), axis.title.y may apply to the axis that
  # shows the percentage label, which would hide it -- confirm.
  # NOTE(review): legend.position presumably has no visible effect while
  # guides(fill = "none") removes the only legend -- confirm.
  theme(plot.title = element_text(size = 16, face = "bold"),  # Format title
        plot.subtitle = element_text(size = 12),  # Format subtitle
        axis.title.y = element_blank(),  # Blank the title of the vertical axis
        legend.position = "bottom")  # Legend position setting

# Display the ggplot object
lrscale_plot_v2

### Question 5: Provide code and answer.
# Prompt: In Hungary, what is the conditional probability of NOT feeling close to any particular party given that the person lives in a rural area?
## Answer
## Load Hungary data

# Load the Hungary country file
hungary_data <- read.fst("hungary_data.fst")

# Recode geo and clsprty, then drop respondents missing either one.
# - domicil 1-2 -> "Urban", 3-5 -> "Rural", 7/8/9 -> NA
# - clsprty 7/8/9 -> NA, so those codes do not sit in the denominator of the
#   conditional probabilities (clsprty stays coded 1 = close, 2 = not close)
hungary_data <- hungary_data %>%
  mutate(
    geo = recode(
      as.character(domicil),
      "1" = "Urban",
      "2" = "Urban",
      "3" = "Rural",
      "4" = "Rural",
      "5" = "Rural",
      "7" = NA_character_,
      "8" = NA_character_,
      "9" = NA_character_
    ),
    clsprty = ifelse(clsprty %in% c(7, 8, 9), NA, clsprty)
  ) %>%
  # The analysis uses clsprty and geo, so these are the columns screened for NA
  filter(!is.na(clsprty), !is.na(geo))

# Conditional probability of each clsprty value given geo:
# cell count divided by the total count of its geo group
cond <- hungary_data %>%
  count(clsprty, geo) %>%
  group_by(geo) %>%
  mutate(prob = n / sum(n)) %>%
  ungroup()

cond

## # A tibble: 10 × 4
## # Groups:   geo [2]
##    clsprty geo       n     prob
##      <dbl> <chr> <int>    <dbl>
##  1       1 Rural  5055 0.429   
##  2       1 Urban  2283 0.472   
##  3       2 Rural  6275 0.532   
##  4       2 Urban  2395 0.495   
##  5       7 Rural   234 0.0199  
##  6       7 Urban    88 0.0182  
##  7       8 Rural   219 0.0186  
##  8       8 Urban    70 0.0145  
##  9       9 Rural     4 0.000339
## 10       9 Urban     4 0.000826

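Recall that 2 is coded as not feeling close to a particular political party. Once the non-answer codes (7, 8, 9) are excluded, the rural group contains 5055 + 6275 = 11330 respondents, of whom 6275 do not feel close to a party. So the conditional probability of NOT feeling close to any particular party given that the person lives in a rural area is 6275 / 11330 = 55.38%. (The 0.532 shown in the table above is lower because that table still counts the 7, 8 and 9 codes in the denominator.)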

Naccarato_Isabella_Homework_2

Isabella Naccarato

2024-01-24