Before I start

# Packages this analysis depends on
packages <- c("tidyverse", "fst", "modelsummary") # add any you need here

# Install whichever of them are not yet present in the local library
new_packages <- setdiff(packages, rownames(installed.packages()))
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach each package; character.only = TRUE lets library() take the names
# as strings. The returned list (search path after each attach) is printed.
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"
# Read the ESS data file (path is relative to the working directory) into `ess`
ess <- fst::read_fst(path = "All-ESS-Data.fst")

Task 1

Prompt and question: calculate the average for the variable ‘happy’ for the country of Norway. On average, based on the ESS data, who reports higher levels of happiness: Norway or Belgium?

Levels of happiness for Belgium

# Belgian respondents only; `y` is a working copy of `happy` that gets cleaned
belgium_happy <- ess %>%
  filter(cntry == "BE") %>%
  select(happy) %>%
  mutate(y = happy)
table(belgium_happy[["y"]])
## 
##    0    1    2    3    4    5    6    7    8    9   10   77   88   99 
##   50   27  104  194  234  830  999 3503 6521 3402 1565    3   16    3
# Values 77-99 lie outside the 0-10 answer scale (presumably the ESS
# refusal / don't know / no answer codes -- verify), so set them to NA
belgium_happy <- belgium_happy %>%
  mutate(y = replace(y, y %in% 77:99, NA))
table(belgium_happy[["y"]])
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   50   27  104  194  234  830  999 3503 6521 3402 1565
# Central tendency and spread of the cleaned scores
mean_y <- belgium_happy$y %>% mean(na.rm = TRUE)
cat("Mean of 'y' is:", mean_y, "\n")
## Mean of 'y' is: 7.737334
median_y <- belgium_happy$y %>% median(na.rm = TRUE)
cat("Median of 'y' is:", median_y, "\n")
## Median of 'y' is: 8
# Mode: tally each score, largest count first, and keep the top row
mode_y <- belgium_happy %>%
  drop_na(y) %>%
  count(y, sort = TRUE) %>%
  slice_head(n = 1) %>%
  pull(y)

cat("\nMode of Y:", mode_y, "\n")
## 
## Mode of Y: 8
sd_y <- belgium_happy$y %>% sd(na.rm = TRUE)
cat("Standard Deviation of 'y':", sd_y, "\n")
## Standard Deviation of 'y': 1.52045
# Raw column (`happy`) next to the cleaned one (`y`)
belgium_happy %>% summary()
##      happy              y         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 7.000   1st Qu.: 7.000  
##  Median : 8.000   Median : 8.000  
##  Mean   : 7.839   Mean   : 7.737  
##  3rd Qu.: 9.000   3rd Qu.: 9.000  
##  Max.   :99.000   Max.   :10.000  
##                   NA's   :22

Levels of happiness for Norway

# Norwegian respondents only; `y` is a working copy of `happy` that gets cleaned
norway_happy <- ess %>%
  filter(cntry == "NO") %>%
  select(happy) %>%
  mutate(y = happy)
table(norway_happy[["y"]])
## 
##    0    1    2    3    4    5    6    7    8    9   10   77   88 
##   15   29   59  163  238  730  817 2617 5235 3796 2344   12   10
# Values 77-99 lie outside the 0-10 answer scale (presumably the ESS
# refusal / don't know / no answer codes -- verify), so set them to NA
norway_happy <- norway_happy %>%
  mutate(y = replace(y, y %in% 77:99, NA))
table(norway_happy[["y"]])
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   15   29   59  163  238  730  817 2617 5235 3796 2344
# Central tendency and spread of the cleaned scores
mean_y <- norway_happy$y %>% mean(na.rm = TRUE)
cat("Mean of 'y' is:", mean_y, "\n")
## Mean of 'y' is: 7.975005
median_y <- norway_happy$y %>% median(na.rm = TRUE)
cat("Median of 'y' is:", median_y, "\n")
## Median of 'y' is: 8
# Mode: tally each score, largest count first, and keep the top row
mode_y <- norway_happy %>%
  drop_na(y) %>%
  count(y, sort = TRUE) %>%
  slice_head(n = 1) %>%
  pull(y)

cat("\nMode of Y:", mode_y, "\n")
## 
## Mode of Y: 8
sd_y <- norway_happy$y %>% sd(na.rm = TRUE)
cat("Standard Deviation of 'y':", sd_y, "\n")
## Standard Deviation of 'y': 1.539186
# Raw column (`happy`) next to the cleaned one (`y`)
norway_happy %>% summary()
##      happy              y         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 7.000   1st Qu.: 7.000  
##  Median : 8.000   Median : 8.000  
##  Mean   : 8.076   Mean   : 7.975  
##  3rd Qu.: 9.000   3rd Qu.: 9.000  
##  Max.   :88.000   Max.   :10.000  
##                   NA's   :22

Answer: Norway (mean 7.975 ± SD 1.539) reports higher levels of happiness than Belgium (mean 7.737 ± SD 1.520).

Task 2

Prompt and question: what is the most common category selected, for Irish respondents, for frequency of binge drinking? The variable of interest is: alcbnge.

# Irish respondents only; `y` is a working copy of the binge-drinking item
ireland_alcbnge <- ess %>%
  filter(cntry == "IE") %>%
  select(alcbnge) %>%
  mutate(y = alcbnge)
table(ireland_alcbnge[["y"]])
## 
##   1   2   3   4   5   6   7   8 
##  65 650 346 417 239 641  26   6
# Codes 6-8 are set to NA; only 1-5 receive a frequency label further down
ireland_alcbnge <- ireland_alcbnge %>%
  mutate(y = replace(y, y %in% 6:8, NA))
median_y <- ireland_alcbnge$y %>% median(na.rm = TRUE)
mean_y <- ireland_alcbnge$y %>% mean(na.rm = TRUE)

cat("Mean of 'y':", mean_y, "\n")
## Mean of 'y': 3.066977
cat("Median of 'y':", median_y, "\n")
## Median of 'y': 3
# Translate the numeric codes into labels, then order the factor levels from
# most to least frequent drinking instead of alphabetically
df <- ireland_alcbnge %>%
  mutate(
    y_category = case_when(
      y == 1 ~ "Daily or almost daily",
      y == 2 ~ "Weekly",
      y == 3 ~ "Monthly",
      y == 4 ~ "Less than monthly",
      y == 5 ~ "Never",
      TRUE ~ NA_character_
    )
  ) %>%
  mutate(
    y_category = y_category %>%
      factor() %>%
      fct_relevel(
        "Daily or almost daily",
        "Weekly",
        "Monthly",
        "Less than monthly",
        "Never"
      )
  )
table(df[["y_category"]])
## 
## Daily or almost daily                Weekly               Monthly 
##                    65                   650                   346 
##     Less than monthly                 Never 
##                   417                   239
# Return the most frequent value(s) of `v` as character strings.
# All tied values are returned; NAs are not counted because table() drops
# them by default.
get_mode <- function(v) {
  counts <- table(v)
  is_top <- counts == max(counts)
  as.character(names(counts)[is_top])
}
# Modal category label(s); ties would be printed comma-separated
mode_values <- df$y_category %>% get_mode()
cat("Mode of y category:", toString(mode_values), "\n")
## Mode of y category: Weekly

Answer: The most common category selected by Irish respondents for frequency of binge drinking is “Weekly” (code 2; 650 respondents).

Task 3

Prompt and question: when you use the summary() function for the variable plnftr (about planning for the future or taking each day as it comes, from 0-10) for both the countries of Portugal and Serbia, what do you notice? What stands out as different when you compare the two countries (note: look up the variable information on the ESS website to help with interpretation)? Explain while referring to the output generated.

Obtaining data for the variable plnftr for Portugal

# Portuguese respondents only; `y` is a working copy of `plnftr`
portugal_plnftr <- ess %>%
  filter(cntry == "PT") %>%
  select(plnftr) %>%
  mutate(y = plnftr)

table(portugal_plnftr[["y"]])
## 
##   0   1   2   3   4   5   6   7   8   9  10  88 
## 114 184 313 356 264 481 262 382 345 166 370  40
# 88 is the only value outside 0-10 in the table above; set it to NA
portugal_plnftr <- portugal_plnftr %>%
  mutate(y = replace(y, y %in% 88, NA))

table(portugal_plnftr[["y"]])
## 
##   0   1   2   3   4   5   6   7   8   9  10 
## 114 184 313 356 264 481 262 382 345 166 370
mean_y <- portugal_plnftr$y %>% mean(na.rm = TRUE)
cat("Mean of 'y' is:", mean_y, "\n")
## Mean of 'y' is: 5.418289
median_y <- portugal_plnftr$y %>% median(na.rm = TRUE)
cat("Median of 'y' is:", median_y, "\n")
## Median of 'y' is: 5

Obtaining data for the variable plnftr for Serbia

# Serbian respondents only; `y` is a working copy of `plnftr`.
# Serbia's country code is "RS". The earlier filter used "SE", which is
# Sweden, so every Serbia figure computed from it described Swedish data.
serbia_plnftr <- ess %>%
  filter(cntry == "RS") %>%
  select(plnftr)
serbia_plnftr$y <- serbia_plnftr$plnftr

table(serbia_plnftr$y)
# Set all three out-of-scale codes to NA. Recoding only 88 left 77 and 99 in
# `y`, which inflates the mean and shows up as Max 99 in summary().
serbia_plnftr$y[serbia_plnftr$y %in% c(77, 88, 99)] <- NA

table(serbia_plnftr$y)
mean_y <- mean(serbia_plnftr$y, na.rm = TRUE)
cat("Mean of 'y' is:", mean_y, "\n")
median_y <- median(serbia_plnftr$y, na.rm = TRUE)
cat("Median of 'y' is:", median_y, "\n")
# NOTE(review): the console output previously recorded for this chunk came
# from the Swedish rows and has been removed. Re-knit to regenerate it, then
# update the summary(serbia_plnftr) output and the written answer to match.

Comparing the summaries for two countries

# Summaries of the raw item (`plnftr`) and the cleaned copy (`y`), per country
portugal_plnftr %>% summary()
##      plnftr             y         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 3.000   1st Qu.: 3.000  
##  Median : 5.000   Median : 5.000  
##  Mean   : 6.426   Mean   : 5.418  
##  3rd Qu.: 8.000   3rd Qu.: 8.000  
##  Max.   :88.000   Max.   :10.000  
##  NA's   :14604    NA's   :14644
# NOTE(review): in the output recorded below, `y` still has Max 99, so
# out-of-scale codes were present in `y` when this output was produced.
serbia_plnftr %>% summary()
##      plnftr             y         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 3.000   1st Qu.: 3.000  
##  Median : 5.000   Median : 5.000  
##  Mean   : 5.254   Mean   : 5.087  
##  3rd Qu.: 7.000   3rd Qu.: 7.000  
##  Max.   :99.000   Max.   :99.000  
##  NA's   :14750    NA's   :14757

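Answer: The degree to which people tend to either plan for their future or take each day as it comes is very similar in the two countries: both have a median of 5 and a first quartile of 3. On this item 0 means “I plan for my future as much as possible” and 10 means “I just take each day as it comes”, so a higher score means less planning. The Portuguese respondents (mean 5.418) therefore lean slightly more towards taking each day as it comes than the Serbian respondents (mean 5.087), which also shows in Portugal's higher third quartile (8 vs. 7). Note that the Serbian summary of y still has a maximum of 99, so some missing-value codes were not removed and its mean is slightly overstated.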

Task 4

Prompt and question: using the variables stfdem and gndr, answer the following: on average, who is more dissatisfied with democracy in Italy, men or women? Explain while referring to the output generated.

Loading datasets for the variables stfdem and gndr

# Italian respondents, with gender labelled and the out-of-scale stfdem codes
# set to NA
italy_data <- ess %>%
  filter(cntry == "IT") %>%
  mutate(
    # 1 and 2 get labels; any other gender code is kept as its own string
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      TRUE ~ as.character(gndr)
    ),
    # 99 is now recoded along with 77 and 88, matching the 77/88/99 handling
    # used for stfedu and stfhlth; left in, a 99 would inflate the mean
    stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem)
  )

# NOTE(review): the values recorded below were produced before 99 was added
# to the missing codes; re-knit to confirm they are unchanged.

# Mean satisfaction with democracy among men (one-row data frame)
mean_male_stfdem <- italy_data %>%
  filter(gndr == "Male") %>%
  summarize(mean_stfdem_men = mean(stfdem, na.rm = TRUE))

print(mean_male_stfdem)
##   mean_stfdem_men
## 1        4.782646

# Mean satisfaction with democracy among women (one-row data frame)
mean_female_stfdem <- italy_data %>%
  filter(gndr == "Female") %>%
  summarize(mean_stfdem_women = mean(stfdem, na.rm = TRUE))

print(mean_female_stfdem)
##   mean_stfdem_women
## 1          4.694652

Answer: On average, women (mean_female_stfdem = 4.695) are slightly more dissatisfied with democracy in Italy than men (mean_male_stfdem = 4.783), since a lower score on stfdem means lower satisfaction.

Task 5

Prompt: Interpret the boxplot graph of stfedu and stfhlth that we generated already: according to ESS data, would we say that the median French person is more satisfied with the education system or health services? Explain.

Answer: As the provided boxplot graph shows, the median of the variable stfhlth is apparently higher than that of the variable stfedu. Thus, we could say that the median French person is more satisfied with the health services than with the education system.

Change the boxplot graph: provide the code to change some of the key labels: (1) Change the title to: Boxplot of satisfaction with the state of education vs. health services; (2) Remove the x-axis label; (3) Change the y-axis label to: Satisfaction (0-10).

# French respondents only
france_data <- ess %>%
  filter(cntry == "FR")

# Side-by-side boxplots of satisfaction with education (stfedu) and with
# health services (stfhlth), using the labels requested in the task
france_data %>%
  mutate(
    # 77, 88 and 99 are out-of-scale codes; drop them before plotting
    stfedu = ifelse(stfedu %in% c(77, 88, 99), NA, stfedu),
    stfhlth = ifelse(stfhlth %in% c(77, 88, 99), NA, stfhlth)
  ) %>%
  select(stfedu, stfhlth) %>%
  # Long format, one row per (item, score); replaces the superseded gather()
  pivot_longer(
    cols = c(stfedu, stfhlth),
    names_to = "variable",
    values_to = "value"
  ) %>%
  # x = variable draws one box per item; without it both items are pooled
  # into a single box and cannot be compared
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot() +
  labs(
    title = "Boxplot of satisfaction with the state of education vs. health services",
    x = NULL, # removes the x-axis label
    y = "Satisfaction (0-10)"
  ) +
  theme_minimal()
## Warning: Removed 364 rows containing non-finite values (`stat_boxplot()`).

This homework is done by Seokang “Eric” Kim at 9:11 PM, January 16 2024.