# Start from a clean workspace: remove the objects listed by ls(), then run the
# garbage collector (gc() also prints the memory usage table that follows).
rm(list=ls()); gc()
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 529145 28.3    1177120 62.9         NA   669445 35.8
## Vcells 975682  7.5    8388608 64.0      16384  1851710 14.2
# Packages this script depends on (extend the vector as needed)
packages <- c("tidyverse", "fst", "modelsummary")

# Work out which of them are absent from the local library and install only those
new_packages <- setdiff(packages, installed.packages()[, "Package"])
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package in turn; the list returned by lapply() is printed below
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"
# Read the ESS data from the fst file (path is relative to the working directory)
ess <- read_fst("All-ESS-Data.fst")

Task 1

Provide code and answer.

Prompt and question: calculate the average for the variable ‘happy’ for the country of Norway. On average, based on the ESS data, who reports higher levels of happiness: Norway or Belgium?

Note: we already did it for Belgium. You just need to compare to Norway’s average, making sure to provide the code for both.

# Keep only the 'happy' answers of Belgian respondents (cntry == "BE") and copy
# them into a working column y, leaving the raw column untouched
belgium_happy <- ess %>%
  filter(cntry == "BE") %>%
  select(happy) %>%
  mutate(y = happy)

# Frequency of every response code before any cleaning
table(belgium_happy[["y"]])
## 
##    0    1    2    3    4    5    6    7    8    9   10   77   88   99 
##   50   27  104  194  234  830  999 3503 6521 3402 1565    3   16    3
# 77, 88 and 99 are not happiness scores and would distort the average if kept.
# See the data portal for what each code represents (e.g. DK, Refusal, etc.).

# Mark every value from 77 through 99 as missing
is.na(belgium_happy$y) <- belgium_happy$y %in% 77:99

# Check that only the 0-10 answers remain
table(belgium_happy[["y"]])
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   50   27  104  194  234  830  999 3503 6521 3402 1565
# Average of y for Belgium, skipping missing values
mean_y <- belgium_happy %>%
  pull(y) %>%
  mean(na.rm = TRUE)
cat("Mean of 'y' is:", mean_y, "\n")
## Mean of 'y' is: 7.737334
# Same preparation for Norway (cntry == "NO"): isolate 'happy' and copy it to y
norway_happy <- ess %>%
  filter(cntry == "NO") %>%
  select(happy) %>%
  mutate(y = happy)

# Frequency of every response code before any cleaning
table(norway_happy[["y"]])
## 
##    0    1    2    3    4    5    6    7    8    9   10   77   88 
##   15   29   59  163  238  730  817 2617 5235 3796 2344   12   10
# Recode the non-response codes to NA. 'happy' uses 77, 88 and 99 (all three
# occur in the Belgian table), so list the full set explicitly rather than
# relying on 99 being absent from the Norwegian rows.
norway_happy$y[norway_happy$y %in% c(77, 88, 99)] <- NA

# checking again
table(norway_happy$y)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   15   29   59  163  238  730  817 2617 5235 3796 2344
# Average of y for Norway, skipping missing values
mean_y <- norway_happy %>%
  pull(y) %>%
  mean(na.rm = TRUE)
cat("Mean of 'y' is:", mean_y, "\n")
## Mean of 'y' is: 7.975005

Based on the ESS data, the mean of the levels of happiness of Norway is 7.975005. On the other hand, the mean of Belgium is 7.737334. Since 7.975005 > 7.737334, Norway reports higher levels of happiness.

Task 2

Provide code and answer.

Prompt and question: what is the most common category selected, for Irish respondents, for frequency of binge drinking? The variable of interest is: alcbnge.

More info here: https://ess-search.nsd.no/en/variable/0c65116e-7481-4ca6-b1d9-f237db99a694.

Hint: need to convert numeric value entries to categories as specified in the variable information link. We did similar steps for Estonia and the climate change attitude variable.

# Irish respondents (cntry == "IE"): keep alcbnge and copy it into y
ireland_bngdrnk <- ess %>%
  filter(cntry == "IE") %>%
  select(alcbnge) %>%
  mutate(y = alcbnge)

# Frequency of every response code before any cleaning
table(ireland_bngdrnk[["y"]])
## 
##   1   2   3   4   5   6   7   8 
##  65 650 346 417 239 641  26   6
# Recode values 6 through 8 to NA
ireland_bngdrnk$y[ireland_bngdrnk$y %in% 6:8] <- NA

# Convert the numeric codes 1-5 to labelled categories so the mode is reported
# as a category instead of a number. factor() with explicit levels/labels fixes
# the display order (otherwise it would be alphabetical), keeps all five
# categories even if one has no respondents, and maps any other value to NA.
df <- ireland_bngdrnk %>%
  mutate(
    y_category = factor(
      y,
      levels = 1:5,
      labels = c(
        "Daily or almost daily",
        "Weekly",
        "Monthly",
        "Less than monthly",
        "Never"
      )
    )
  )

# To confirm the conversion:
table(df$y_category)
## 
## Daily or almost daily                Weekly               Monthly 
##                    65                   650                   346 
##     Less than monthly                 Never 
##                   417                   239
# Let's determine the mode of our newly created category:

# Return the most frequent value(s) of a vector as character strings.
#
# v: an atomic vector or factor; NA entries are ignored (table() drops them).
# Returns a character vector with every value tied for the highest count, or
# character(0) when v has no non-missing observations.
get_mode <- function(v) {
  counts <- table(v)
  # Guard: max() on an empty table warns, and a factor with only NA entries
  # would otherwise report every (zero-count) level as a mode.
  if (length(counts) == 0L || max(counts) == 0L) {
    return(character(0))
  }
  names(counts)[counts == max(counts)]
}

# Most frequent category (or categories, comma-separated, if there is a tie)
mode_values <- df$y_category %>% get_mode()
cat("Mode of y category:", toString(mode_values), "\n")
## Mode of y category: Weekly

For Irish respondents, the most common category selected for frequency of binge drinking is weekly.

Task 3

Provide code and answer.

Prompt and question: when you use the summary() function for the variable plnftr (about planning for the future or taking each day as it comes, from 0-10) for both the countries of Portugal and Serbia, what do you notice? What stands out as different when you compare the two countries (note: look up the variable information on the ESS website to help with interpretation)? Explain while referring to the output generated.

# Portuguese respondents (cntry == "PT"): keep plnftr and copy it into y
portugal_plnftr <- ess %>%
  filter(cntry == "PT") %>%
  select(plnftr) %>%
  mutate(y = plnftr)

# Frequency of every response code before any cleaning
table(portugal_plnftr[["y"]])
## 
##   0   1   2   3   4   5   6   7   8   9  10  88 
## 114 184 313 356 264 481 262 382 345 166 370  40
# Recode the non-response codes to NA. Only 88 occurs in the Portuguese rows,
# but the same variable also carries 77 in the Serbian data, so recode the
# full 77/88/99 set to keep every country cleaned the same way.
portugal_plnftr$y[portugal_plnftr$y %in% c(77, 88, 99)] <- NA

# checking again
table(portugal_plnftr$y)
## 
##   0   1   2   3   4   5   6   7   8   9  10 
## 114 184 313 356 264 481 262 382 345 166 370
# Mean and median of y in a single summarize() call, printed as a one-row table
print(
  summarize(
    portugal_plnftr,
    mean_y = mean(y, na.rm = TRUE),
    median_y = median(y, na.rm = TRUE)
  )
)
##     mean_y median_y
## 1 5.418289        5
# Mode of y: count each non-missing value, sort by frequency (largest first)
# and keep the value in the top row
mode_y <- portugal_plnftr %>%
  drop_na(y) %>%
  count(y, sort = TRUE) %>%
  slice_head(n = 1) %>%
  pull(y)

cat("\nMode of Y:", mode_y, "\n")
## 
## Mode of Y: 5
# Spread of y for Portugal, skipping missing values
sd_y <- portugal_plnftr %>%
  pull(y) %>%
  sd(na.rm = TRUE)
cat("Standard Deviation of 'y':", sd_y, "\n")
## Standard Deviation of 'y': 2.86348
# summary() on the whole data frame reports both columns side by side: the raw
# plnftr (which still contains the 88 code) and the recoded y.
summary(portugal_plnftr)
##      plnftr             y         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 3.000   1st Qu.: 3.000  
##  Median : 5.000   Median : 5.000  
##  Mean   : 6.426   Mean   : 5.418  
##  3rd Qu.: 8.000   3rd Qu.: 8.000  
##  Max.   :88.000   Max.   :10.000  
##  NA's   :14604    NA's   :14644
# Serbian respondents (cntry == "RS"): keep plnftr and copy it into y
serbia_plnftr <- ess %>%
  filter(cntry == "RS") %>%
  select(plnftr) %>%
  mutate(y = plnftr)

# Frequency of every response code before any cleaning
table(serbia_plnftr[["y"]])
## 
##   0   1   2   3   4   5   6   7   8   9  10  77  88 
## 587 133 152 138  95 246  70  87 103  47 364   4  17
# Recode the non-response codes to NA. 77 and 88 occur in the Serbian rows;
# 99 is listed as well so the recode matches the 77/88/99 set used elsewhere
# in this script and does not depend on 99 being absent here.
serbia_plnftr$y[serbia_plnftr$y %in% c(77, 88, 99)] <- NA

# checking again
table(serbia_plnftr$y)
## 
##   0   1   2   3   4   5   6   7   8   9  10 
## 587 133 152 138  95 246  70  87 103  47 364
# Mean and median of y in a single summarize() call, printed as a one-row table
print(
  summarize(
    serbia_plnftr,
    mean_y = mean(y, na.rm = TRUE),
    median_y = median(y, na.rm = TRUE)
  )
)
##     mean_y median_y
## 1 4.142928        4
# Mode of y: count each non-missing value, sort by frequency (largest first)
# and keep the value in the top row
mode_y <- serbia_plnftr %>%
  drop_na(y) %>%
  count(y, sort = TRUE) %>%
  slice_head(n = 1) %>%
  pull(y)

cat("\nMode of Y:", mode_y, "\n")
## 
## Mode of Y: 0
# Spread of y for Serbia, skipping missing values
sd_y <- serbia_plnftr %>%
  pull(y) %>%
  sd(na.rm = TRUE)
cat("Standard Deviation of 'y':", sd_y, "\n")
## Standard Deviation of 'y': 3.757209
# summary() on the whole data frame reports both columns side by side: the raw
# plnftr (which still contains the 77/88 codes) and the recoded y.
summary(serbia_plnftr)
##      plnftr             y         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 4.000   Median : 4.000  
##  Mean   : 4.983   Mean   : 4.143  
##  3rd Qu.: 8.000   3rd Qu.: 8.000  
##  Max.   :88.000   Max.   :10.000  
##  NA's   :1505     NA's   :1526

Compared to Portugal, more people in Serbia plan for their future as much as possible. On this scale, lower numbers mean that a person plans for the future as much as possible, while higher numbers mean that a person just takes each day as it comes. In the summary output for the recoded variable y, Serbia's first quartile (0 vs. 3), median (4 vs. 5), and mean (4.143 vs. 5.418) are all lower than Portugal's. This shows that people in Serbia tend to prefer planning for their future, while people in Portugal tend to prefer taking each day as it comes. At the same time, Serbia and Portugal have the same minimum (0) and maximum (10).

Task 4

Provide code and answer.

Prompt and question: using the variables stfdem and gndr, answer the following: on average, who is more dissatisfied with democracy in Italy, men or women? Explain while referring to the output generated.

Info on variable here: https://ess.sikt.no/en/variable/query/stfdem/page/1

# Italian respondents (cntry == "IT"), with two columns cleaned in one pass:
#   gndr   - codes 1/2 become "Male"/"Female"; any other code is kept as text
#   stfdem - satisfaction with democracy; codes 77, 88 and 99 become NA
italy_data <- ess %>%
  filter(cntry == "IT") %>%
  mutate(
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      .default = as.character(gndr)
    ),
    stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem)
  )
table(italy_data[["gndr"]])
## 
##      9 Female   Male 
##     13   5329   4836
# Drop respondents whose gender is missing or carries the leftover code "9"
italy_data <- italy_data %>%
  filter(!is.na(gndr), gndr != "9")

# Only the two labelled categories should remain
table(italy_data[["gndr"]])
## 
## Female   Male 
##   5329   4836
# Mean of stfdem among male respondents only
mean_male_stfdem <- summarize(
  filter(italy_data, gndr == "Male"),
  mean_stfdem_men = mean(stfdem, na.rm = TRUE)
)

print(mean_male_stfdem)
##   mean_stfdem_men
## 1        4.782646
# Mean of stfdem within each gender category: group on gndr (the second
# variable), then summarize stfdem (the variable of interest)
means_by_gender <- summarize(
  group_by(italy_data, gndr),
  stfdem = mean(stfdem, na.rm = TRUE)
)

print(means_by_gender)
## # A tibble: 2 × 2
##   gndr   stfdem
##   <chr>   <dbl>
## 1 Female   4.66
## 2 Male     4.78

According to the output, the average for women is 4.657553 and the average for men is 4.782646. Since 4.657553 < 4.782646, women are, on average, more dissatisfied with democracy in Italy, because lower numbers are closer to “extremely dissatisfied” (0) and higher numbers are closer to “extremely satisfied” (10).

Task 5

Provide code and answer.

Prompt: Interpret the boxplot graph of stfedu and stfhlth that we generated already: according to ESS data, would we say that the median French person is more satisfied with the education system or health services? Explain.

Change the boxplot graph: provide the code to change some of the key labels: (1) Change the title to: Boxplot of satisfaction with the state of education vs. health services; (2) Remove the x-axis label; (3) Change the y-axis label to: Satisfaction (0-10).

Hint: copy the boxplot code above and just replace or cut what is asked.

# French respondents (cntry == "FR")
france_data <- ess %>%
  filter(cntry == "FR")

# Boxplot comparing satisfaction with education (stfedu) and with health
# services (stfhlth)
france_data %>%
  # Codes 77, 88 and 99 are not satisfaction scores: set them to NA
  mutate(
    across(c(stfedu, stfhlth), ~ ifelse(.x %in% c(77, 88, 99), NA, .x))
  ) %>%
  # Reshape to long format: one row per (variable, value) pair
  select(stfedu, stfhlth) %>%
  pivot_longer(
    c(stfedu, stfhlth),
    names_to = "variable",
    values_to = "value"
  ) %>%
  # Creating the boxplot
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot() +
  labs(
    x = NULL, # the task asks for the x-axis label to be removed
    y = "Satisfaction (0-10)",
    title = "Boxplot of satisfaction with the state of education vs. health services"
  ) +
  theme_minimal()
## Warning: Removed 364 rows containing non-finite values (`stat_boxplot()`).