Installing & Loading Packages

# Packages this analysis relies on.
packages <- c("tidyverse", "fst", "modelsummary", "ggplot2", "dplyr")

# Work out which of them are absent from the local library and install those.
already_installed <- installed.packages()[, "Package"]
new_packages <- packages[is.na(match(packages, already_installed))]
if (length(new_packages)) {
  install.packages(new_packages)
}

# Attach every package; character.only = TRUE lets library() take the names
# as strings. The list returned by lapply() is printed at top level.
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` has built-in support to draw text-only (markdown) tables.
##   To generate tables in other formats, you must install one or more of
##   these libraries:
##   
## install.packages(c(
##     "kableExtra",
##     "gt",
##     "flextable",
##    
##   "huxtable",
##     "DT"
## ))
## 
##   Alternatively, you can set markdown as the default table format to
##   silence this alert:
##   
## config_modelsummary(factory_default = "markdown")
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"
# Report the directory the session is currently working in.
getwd()
## [1] "/Users/owner/Downloads"
# Point the session at the folder used for this analysis.
# NOTE(review): this is a hard-coded absolute path and presumably only exists
# on the author's machine -- confirm before running elsewhere.
setwd("/Users/owner/Downloads")
# Empty the global environment, then run the garbage collector (its memory
# usage table is printed below).
rm(list = ls())
gc()
##           used (Mb) gc trigger  (Mb) limit (Mb) max used (Mb)
## Ncells 1080555 57.8    2198598 117.5         NA  1484656 79.3
## Vcells 1861986 14.3    8388608  64.0      16384  2379122 18.2
# Read the ESS data file from the working directory.
ess <- fst::read_fst("All-ESS-Data.fst")

Task 1: Calculate the average for the variable ‘happy’ for the country of Norway. On average, based on the ESS data, who reports higher levels of happiness: Norway or Belgium? Note: we already did it for Belgium. You just need to compare to Norway’s average, making sure to provide the code for both.

# Belgium: pull the 'happy' responses and keep a working copy in 'y'
belgium_happy <- ess %>%
  filter(cntry == "BE") %>%
  select(happy) %>%
  mutate(y = happy)
table(belgium_happy$y)
## 
##    0    1    2    3    4    5    6    7    8    9   10   77   88   99 
##   50   27  104  194  234  830  999 3503 6521 3402 1565    3   16    3
# Blank out every code from 77 through 99 so only the 0-10 answers remain
belgium_happy$y <- replace(belgium_happy$y, belgium_happy$y %in% 77:99, NA)
table(belgium_happy$y)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   50   27  104  194  234  830  999 3503 6521 3402 1565
# Belgium: mean of the cleaned responses (NA values are skipped)
mean_y <- mean(belgium_happy$y, na.rm = TRUE)
cat("Mean of 'y' is:", mean_y, "\n")
## Mean of 'y' is: 7.737334
# Norway: pull the 'happy' responses and keep a working copy in 'y'
norway_happy <- ess %>%
  filter(cntry == "NO") %>%
  select(happy)
norway_happy$y <- norway_happy$happy
table(norway_happy$y)
## 
##    0    1    2    3    4    5    6    7    8    9   10   77   88 
##   15   29   59  163  238  730  817 2617 5235 3796 2344   12   10
# Recode all three out-of-scale codes (77, 88, 99) to NA, the same set this
# script removes from the French satisfaction items. The earlier range 77:88
# left 99 untreated; no 99 appears in the table above, so the cleaned counts
# and the mean below are unchanged.
norway_happy$y[norway_happy$y %in% c(77, 88, 99)] <- NA
table(norway_happy$y)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   15   29   59  163  238  730  817 2617 5235 3796 2344
# Norway: mean of the cleaned responses (NA values are skipped).
# Note that mean_y is reused, so it now holds Norway's mean, not Belgium's.
mean_y <- mean(norway_happy$y, na.rm = TRUE)
cat("Mean of 'y' is:", mean_y, "\n")
## Mean of 'y' is: 7.975005

Therefore, on average, Norway reports higher levels of happiness than Belgium (a mean of 7.98 versus 7.74 on the 0-10 scale).

Task 2: What is the most common category selected, for Irish respondents, for frequency of binge drinking? The variable of interest is: alcbnge.

# Ireland: pull the binge-drinking item and keep a working copy in 'y'.
# Ireland's country code is "IE"; the filter previously used "IL", which is
# Israel, so the wrong country was being analysed.
# NOTE: the printed counts and the stated mode that follow in this document
# were produced with the "IL" filter and must be regenerated.
ireland_alcbnge <- ess %>%
  filter(cntry == "IE") %>%
  select(alcbnge)
ireland_alcbnge$y <- ireland_alcbnge$alcbnge
table(ireland_alcbnge$y)
## 
##    1    2    3    4    5    7    8    9 
##   15  153  149  264  362    5   69 1545
# Treat codes 7, 8 and 9 as missing
ireland_alcbnge$y <- replace(ireland_alcbnge$y, ireland_alcbnge$y %in% 7:9, NA)
table(ireland_alcbnge$y)
## 
##   1   2   3   4   5 
##  15 153 149 264 362
# Attach a text label to each numeric code, then order the factor levels
# from the most frequent drinking category to "Never"
df <- ireland_alcbnge %>%
  mutate(
    y_category = case_when(
      y == 1 ~ "Daily or almost daily",
      y == 2 ~ "Weekly",
      y == 3 ~ "Monthly",
      y == 4 ~ "Less than monthly",
      y == 5 ~ "Never",
      TRUE ~ NA_character_
    )
  ) %>%
  mutate(
    y_category = fct_relevel(
      factor(y_category),
      "Daily or almost daily",
      "Weekly",
      "Monthly",
      "Less than monthly",
      "Never"
    )
  )
table(df$y_category)
## 
## Daily or almost daily                Weekly               Monthly 
##                    15                   153                   149 
##     Less than monthly                 Never 
##                   264                   362
# Calculating mode
# get_mode(): return the most frequent value(s) of `v` as character strings.
#   v       a vector or factor; NA entries are not counted because table()
#           drops them by default.
#   returns a character vector holding every value tied for the highest
#           count, or character(0) when `v` has nothing to count.
get_mode <- function(v) {
  counts <- table(v)
  # With nothing to count, max() would warn and return -Inf, so stop here.
  if (length(counts) == 0L) {
    return(character(0))
  }
  names(counts)[counts == max(counts)]
}
# Find the most common category and print it (ties are comma-separated)
mode_values <- get_mode(df[["y_category"]])
cat("Mode of y category:", toString(mode_values), "\n")
## Mode of y category: Never

Therefore, the most common category selected for the frequency of binge drinking by Irish respondents was “Never”.

Task 3: When you use the summary() function for the variable plnftr (about planning for the future or taking each day as it comes, from 0-10) for both the countries of Portugal and Serbia, what do you notice? What stands out as different when you compare the two countries (note: look up the variable information on the ESS website to help with interpretation)? Explain while referring to the output generated.

# Portugal: pull the 'plnftr' responses and keep a working copy in 'y'
portugal_plnftr <- ess %>%
  filter(cntry == "PT") %>%
  select(plnftr)
portugal_plnftr$y <- portugal_plnftr$plnftr
table(portugal_plnftr$y)
## 
##   0   1   2   3   4   5   6   7   8   9  10  88 
## 114 184 313 356 264 481 262 382 345 166 370  40
# Recode all three out-of-scale codes (77, 88, 99) to NA, the same set this
# script removes from the French satisfaction items. Previously only 88 was
# recoded, so a 77 or 99 would have entered the summary as a real score.
# Only 88 appears in the table above, so the results below are unchanged.
portugal_plnftr$y[portugal_plnftr$y %in% c(77, 88, 99)] <- NA
table(portugal_plnftr$y)
## 
##   0   1   2   3   4   5   6   7   8   9  10 
## 114 184 313 356 264 481 262 382 345 166 370
# Summary of Portugal plnftr data (the NA count is reported separately)
summary(portugal_plnftr$y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   3.000   5.000   5.418   8.000  10.000   14644
# Serbia: pull the 'plnftr' responses and keep a working copy in 'y'
serbia_plnftr <- ess %>%
  filter(cntry == "RS") %>%
  select(plnftr)
serbia_plnftr$y <- serbia_plnftr$plnftr
table(serbia_plnftr$y)
## 
##   0   1   2   3   4   5   6   7   8   9  10  77  88 
## 587 133 152 138  95 246  70  87 103  47 364   4  17
# Recode all three out-of-scale codes (77, 88, 99) to NA, the same set this
# script removes from the French satisfaction items. The earlier range 77:88
# left 99 untreated; no 99 appears in the table above, so the results below
# are unchanged.
serbia_plnftr$y[serbia_plnftr$y %in% c(77, 88, 99)] <- NA
table(serbia_plnftr$y)
## 
##   0   1   2   3   4   5   6   7   8   9  10 
## 587 133 152 138  95 246  70  87 103  47 364
# Summary of Serbia plnftr data (the NA count is reported separately)
summary(serbia_plnftr$y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   4.000   4.143   8.000  10.000    1526

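For the variable plnftr, a response of 0 means the respondent plans for their future as much as possible, and a response of 10 means the respondent takes each day as it comes. From the summary output, Portugal has a higher mean than Serbia (5.418 compared to 4.143) and a higher median (5 compared to 4), indicating that, on average, Portuguese respondents plan less for their future than Serbian respondents. What stands out most is Serbia's first quartile of 0 (compared to 3 for Portugal): at least a quarter of Serbian respondents chose the most extreme "plan as much as possible" answer, while the third quartile (8) and maximum (10) are the same in both countries.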

Task 4: Using the variables stfdem and gndr, answer the following: on average, who is more dissatisfied with democracy in Italy, men or women? Explain while referring to the output generated.

# Italy: keep every Italian respondent
italy_data <- ess %>%
  filter(cntry == "IT")

# Label gender and blank out the out-of-scale codes.
# stfdem runs from 0 (extremely dissatisfied) to 10 (extremely satisfied), so
# 6-10 are valid answers. The earlier code list c(6, 7, 8, 9, 10, 77, 88, 99)
# discarded them, which cut the scale off at 5 and pulled both group means
# down. Only 77, 88 and 99 are recoded to NA now.
# NOTE: the printed means that follow in this document were produced with the
# old code list and must be regenerated.
italy_data <- italy_data %>%
  mutate(
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,      # code 9 is treated as missing
      TRUE ~ as.character(gndr)       # any other code is kept as text
    ),
    stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem)
  )

# Mean satisfaction with democracy per gender group (NA answers are skipped)
means_by_gender <- italy_data %>%
  group_by(gndr) %>%
  summarize(stfdem = mean(stfdem, na.rm = TRUE))
print(means_by_gender)
## # A tibble: 3 × 2
##   gndr   stfdem
##   <chr>   <dbl>
## 1 Female   3.22
## 2 Male     3.19
## 3 <NA>     2.22

On average, males are more dissatisfied with democracy in Italy than females, as they have a lower mean for the variable stfdem, where a score of 0 indicates “Extremely dissatisfied” and a score of 10 indicates “Extremely satisfied”.

Task 5A: Interpret the boxplot graph of stfedu and stfhlth that we generated already: according to ESS data, would we say that the median French person is more satisfied with the education system or health services? Explain.

# Boxplot from tutorial 2
# France: keep every French respondent
france_data <- ess %>%
  filter(cntry == "FR")

# Recode the out-of-scale codes to NA, stack the two satisfaction items into
# long form (one row per respondent per item), and draw one box per item.
# pivot_longer() replaces the superseded gather(); the rows come out in a
# different order but the set of (variable, value) pairs, and so the plot,
# is the same.
france_data %>%
  mutate(
    stfedu = ifelse(stfedu %in% c(77, 88, 99), NA, stfedu),
    stfhlth = ifelse(stfhlth %in% c(77, 88, 99), NA, stfhlth)
  ) %>%
  select(stfedu, stfhlth) %>%
  pivot_longer(
    cols = c(stfedu, stfhlth),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot() +
  labs(y = "Y-axis", x = "X-axis", title = "Boxplot of stfedu vs. stfhlth") +
  theme_minimal()
## Warning: Removed 364 rows containing non-finite values (`stat_boxplot()`).

According to ESS data, the median French person is more satisfied with health services than with the education system. This conclusion can be drawn because the median and the interquartile range for stfedu are both lower than those for stfhlth.

Task 5B: Change the boxplot graph: provide the code to change some of the key labels: (1) Change the title to: Boxplot of satisfaction with the state of education vs. health services; (2) Remove the x-axis label; (3) Change the y-axis label to: Satisfaction (0-10).

# Changing title and axis labels
# Same boxplot as before with the requested labels. pivot_longer() replaces
# the superseded gather(); the plotted values are the same.
france_data %>%
  mutate(
    stfedu = ifelse(stfedu %in% c(77, 88, 99), NA, stfedu),
    stfhlth = ifelse(stfhlth %in% c(77, 88, 99), NA, stfhlth)
  ) %>%
  select(stfedu, stfhlth) %>%
  pivot_longer(
    cols = c(stfedu, stfhlth),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot() +
  labs(
    y = "Satisfaction (0-10)",
    # NULL removes the x-axis label entirely; "" would leave an empty label
    # that still takes up space under the axis.
    x = NULL,
    title = "Boxplot of satisfaction with the state of education vs. health services"
  ) +
  theme_minimal()
## Warning: Removed 364 rows containing non-finite values (`stat_boxplot()`).