Jaishankar_Arvindh_Homework

# Packages this analysis depends on (extend the vector as needed)
packages <- c("tidyverse", "fst", "modelsummary")

# Work out which of them are missing from the local library and install those
new_packages <- setdiff(packages, rownames(installed.packages()))
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package; names are passed as strings, hence character.only
lapply(packages, function(pkg) library(pkg, character.only = TRUE))

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: package 'fst' was built under R version 4.3.2

## Warning: package 'modelsummary' was built under R version 4.3.2

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"

# Read the ESS data from the fst file in the current working directory
ess <- read_fst("All-ESS-Data.fst")

## Warning: package 'fstcore' was built under R version 4.3.2

Task 1

Provide code and answer.

Prompt and question: calculate the average for the variable ‘happy’ for the country of Norway. On average, based on the ESS data, who reports higher levels of happiness: Norway or Belgium?

Note: we already did it for Belgium. You just need to compare to Norway’s average, making sure to provide the code for both.

# List the distinct country codes in the data, to find the codes to filter on
unique(ess$cntry)

##  [1] "AT" "BE" "CH" "CZ" "DE" "DK" "ES" "FI" "FR" "GB" "GR" "HU" "IE" "IL" "IT"
## [16] "LU" "NL" "NO" "PL" "PT" "SE" "SI" "EE" "IS" "SK" "TR" "UA" "BG" "CY" "RU"
## [31] "HR" "LV" "RO" "LT" "AL" "XK" "ME" "RS" "MK"

# Keep only Belgian respondents and copy 'happy' into a working column y,
# so the raw variable stays untouched while y is cleaned
belgium_happy <- ess %>%
  filter(cntry == "BE") %>%
  select(happy) %>%
  mutate(y = happy)

# Frequency of each response code before cleaning
table(belgium_happy$y)

## 
##    0    1    2    3    4    5    6    7    8    9   10   77   88   99 
##   50   27  104  194  234  830  999 3503 6521 3402 1565    3   16    3

# Codes 77, 88 and 99 are not happiness scores (see the ESS data portal for
# what each one represents); left in place they would distort the mean.

# Set every value in the 77-99 range to missing
belgium_happy$y <- replace(belgium_happy$y, belgium_happy$y %in% 77:99, NA)

# Re-tabulate to confirm only 0-10 remain
table(belgium_happy$y)

## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   50   27  104  194  234  830  999 3503 6521 3402 1565

# Average happiness for Belgium, skipping the values recoded to NA
mean_b <- belgium_happy %>%
  pull(y) %>%
  mean(na.rm = TRUE)
cat("Mean of 'Belgium' is:", mean_b, "\n")

## Mean of 'Belgium' is: 7.737334

# Keep only Norwegian respondents and copy 'happy' into a working column y,
# so the raw variable stays untouched while y is cleaned
norway_happy <- ess %>%
  filter(cntry == "NO") %>%
  select(happy) %>%
  mutate(y = happy)

# Frequency of each response code before cleaning
table(norway_happy$y)

## 
##    0    1    2    3    4    5    6    7    8    9   10   77   88 
##   15   29   59  163  238  730  817 2617 5235 3796 2344   12   10

# Recode values 77 through 99 (missing-value codes) to NA for Norway.
# The recode has to target norway_happy: if the codes 77 and 88 stay in y,
# they are averaged in as if they were happiness scores and inflate the mean.
norway_happy$y[norway_happy$y %in% 77:99] <- NA

# checking again: only 0-10 should remain
table(norway_happy$y)

## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   15   29   59  163  238  730  817 2617 5235 3796 2344

# Average happiness for Norway, skipping the values recoded to NA
mean_n <- mean(norway_happy$y, na.rm = TRUE)
cat("Mean of 'Norway' is:", mean_n, "\n")

## Mean of 'Norway' is: 7.975005

Ans: On average, respondents in Norway report higher levels of happiness than respondents in Belgium.

Task 2

Provide code and answer.

Prompt and question: what is the most common category selected, for Irish respondents, for frequency of binge drinking? The variable of interest is: alcbnge.

More info here: https://ess-search.nsd.no/en/variable/0c65116e-7481-4ca6-b1d9-f237db99a694.

Hint: need to convert numeric value entries to categories as specified in the variable information link. We did similar steps for Estonia and the climate change attitude variable.

# Keep only Irish respondents and copy 'alcbnge' into a working column y,
# so the raw variable stays untouched while y is cleaned
ireland_alcbnge <- ess %>%
  filter(cntry == "IE") %>%
  select(alcbnge) %>%
  mutate(y = alcbnge)

# Frequency of each response code before cleaning
table(ireland_alcbnge$y)

## 
##   1   2   3   4   5   6   7   8 
##  65 650 346 417 239 641  26   6

# Codes 6 through 8 are not frequency answers; set them to missing
ireland_alcbnge$y <- replace(ireland_alcbnge$y, ireland_alcbnge$y %in% 6:8, NA)

# Translate the numeric codes into their labels so the mode reads as a
# category name rather than a number
df <- ireland_alcbnge %>%
  mutate(
    y_category = case_match(
      y,
      1 ~ "Daily or almost daily",
      2 ~ "Weekly",
      3 ~ "Monthly",
      4 ~ "Less than monthly",
      5 ~ "Never",
      .default = NA_character_
    )
  ) %>%
  mutate(
    # Fix the level order explicitly; otherwise levels sort alphabetically
    y_category = y_category %>%
      factor() %>%
      fct_relevel(
        "Daily or almost daily",
        "Weekly",
        "Monthly",
        "Less than monthly",
        "Never"
      )
  )

# To confirm the conversion:
table(df$y_category)

## 
## Daily or almost daily                Weekly               Monthly 
##                    65                   650                   346 
##     Less than monthly                 Never 
##                   417                   239

# Return the most frequent value(s) of a vector as character strings.
#
# v: a vector or factor. NA values are ignored because table() drops them
#    by default.
# Returns a character vector with every value tied for the highest count,
# or character(0) when v contains no non-missing values.
get_mode <- function(v) {
  counts <- table(v)
  # Guard: max() on an empty table warns and returns -Inf, and an all-NA
  # factor would otherwise report every (zero-count) level as a mode.
  if (length(counts) == 0 || max(counts) == 0) {
    return(character(0))
  }
  names(counts)[counts == max(counts)]
}

# Report the modal category; ties, if any, are listed comma-separated
mode_values <- get_mode(df$y_category)
cat("Mode of y category:", toString(mode_values), "\n")

## Mode of y category: Weekly

Ans: The most common category selected by Irish respondents is ‘Weekly’, chosen by 650 respondents.

Task 3

Provide code and answer.

Prompt and question: when you use the summary() function for the variable plnftr (about planning for the future or taking each day as it comes, on a scale from 0-10) for both the countries of Portugal and Serbia, what do you notice? What stands out as different when you compare the two countries (note: look up the variable information on the ESS website to help with interpretation)? Explain while referring to the output generated.

  # Step 1: Filter for the countries of interest
  portugal_plnftr <- ess %>%
    filter(cntry == "PT") %>%
    select(plnftr)

  serbia_plnftr <- ess %>%
    filter(cntry == "RS") %>%
    select(plnftr)

  # Step 2: Set the missing-value codes to NA.
  # plnftr is a 0-10 scale, yet the un-recoded summaries report Max = 88:
  # the two-digit missing-value codes were being averaged in as if they were
  # answers, which inflates the means.
  portugal_plnftr$plnftr[portugal_plnftr$plnftr %in% c(77, 88, 99)] <- NA
  serbia_plnftr$plnftr[serbia_plnftr$plnftr %in% c(77, 88, 99)] <- NA

  # Step 3: Summary
  # NOTE: summaries printed before Step 2 was added (Max = 88) are stale;
  # re-run to refresh them.
  summary(portugal_plnftr)

##      plnftr      
##  Min.   : 0.000  
##  1st Qu.: 3.000  
##  Median : 5.000  
##  Mean   : 6.426  
##  3rd Qu.: 8.000  
##  Max.   :88.000  
##  NA's   :14604

 # Same summary for Serbia, to compare against Portugal
 summary(serbia_plnftr)

##      plnftr      
##  Min.   : 0.000  
##  1st Qu.: 0.000  
##  Median : 4.000  
##  Mean   : 4.983  
##  3rd Qu.: 8.000  
##  Max.   :88.000  
##  NA's   :1505

Ans: People in Portugal are more likely to take each day as it comes instead of planning ahead, as their mean is closer to the ‘taking each day as it comes’ end of the 0-10 scale.

Task 4

Provide code and answer.

Prompt and question: using the variables stfdem and gndr, answer the following: on average, who is more dissatisfied with democracy in Italy, men or women? Explain while referring to the output generated.

Info on variable here: https://ess.sikt.no/en/variable/query/stfdem/page/1

# Keep only Italian respondents
italy_data <- ess %>%
  filter(cntry == "IT")

# Label gender and blank out the missing-value codes of stfdem
italy_data <- italy_data %>%
  mutate(
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      TRUE ~ NA_character_ # code 9 (and anything unexpected) is missing
    ),
    # 99 is recoded along with 77 and 88 so no missing code enters the mean
    stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem)
  )

# Compute the average of stfdem by gender.
# The variable averaged must be stfdem (satisfaction with democracy);
# averaging lrscale (left-right placement) does not answer the question.
# On the ESS scale 0 = extremely dissatisfied and 10 = extremely satisfied,
# so the group with the LOWER mean is the more dissatisfied one.
# NOTE: any table rendered from the lrscale version is stale; re-run this
# chunk and revisit the written answer against the new means.
means_by_gender <- italy_data %>%
  filter(!is.na(gndr)) %>% # compare men and women only
  group_by(gndr) %>%
  summarize(mean_stfdem = mean(stfdem, na.rm = TRUE))

print(means_by_gender)

## # A tibble: 3 × 2
##   gndr   lrscale
##   <chr>    <dbl>
## 1 Female    28.3
## 2 Male      24.3
## 3 <NA>      22.2

Ans: On average, women are more dissatisfied with democracy in Italy.

Task 5

Provide code and answer.

Prompt: Interpret the boxplot graph of stfedu and stfhlth that we generated already: according to ESS data, would we say that the median French person is more satisfied with the education system or health services? Explain.

Change the boxplot graph: provide the code to change some of the key labels: (1) Change the title to: Boxplot of satisfaction with the state of education vs. health services; (2) Remove the x-axis label; (3) Change the y-axis label to: Satisfaction (0-10).

Hint: copy the boxplot code above and just replace or cut what is asked.

# Keep only French respondents
france_data <- ess %>%
  filter(cntry == "FR")

# Boxplot comparing satisfaction with education (stfedu) and with health
# services (stfhlth) for France
france_data %>%
  # Setting the missing-value codes to NA
  mutate(stfedu = ifelse(stfedu %in% c(77, 88, 99), NA, stfedu),
         stfhlth = ifelse(stfhlth %in% c(77, 88, 99), NA, stfhlth)) %>%
  # Reshaping to long format: one row per respondent-variable pair
  select(stfedu, stfhlth) %>%
  pivot_longer(
    cols = c(stfedu, stfhlth),
    names_to = "variable",
    values_to = "value"
  ) %>%
  # Creating the boxplot
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot() +
  labs(
    x = NULL, # the task asks for the x-axis label to be removed
    y = "Satisfaction (0-10)",
    title = "Boxplot of satisfaction with the state of education vs. health services"
  ) +
  theme_minimal()

## Warning: Removed 364 rows containing non-finite values (`stat_boxplot()`).

Ans: The median French person is more satisfied with health services as the level of satisfaction on a scale of 0-10 is higher for health services than the education system.