# Remove every object from the global environment, then run the garbage
# collector (its memory table is what gets printed below).
# NOTE(review): rm(list = ls()) does not detach packages or reset options;
# restarting the R session is the only way to get a truly clean state.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 529145 28.3 1177120 62.9 NA 669445 35.8
## Vcells 975682 7.5 8388608 64.0 16384 1851710 14.2
# Packages this script depends on (extend the vector when more are needed)
packages <- c("tidyverse", "fst", "modelsummary")

# Keep only the names that are not in the installed-package list, and install
# those; nothing is installed when the vector is empty.
new_packages <- setdiff(packages, installed.packages()[, "Package"])
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package. lapply() returns one element per library() call, which
# is why the list of attached packages is printed after the startup messages.
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
# Load the ESS file; the relative path is resolved against the working directory
ess <- fst::read_fst(path = "All-ESS-Data.fst")
Provide code and answer.
Prompt and question: calculate the average for the variable ‘happy’ for the country of Norway. On average, based on the ESS data, who reports higher levels of happiness: Norway or Belgium?
Note: we already did it for Belgium. You just need to compare to Norway’s average, making sure to provide the code for both.
# Belgian respondents only, keeping just the `happy` item
belgium_happy <- ess %>%
  filter(cntry == "BE") %>%
  select(happy)
# Work on a copy (`y`) so the raw `happy` column stays available for comparison
belgium_happy[["y"]] <- belgium_happy[["happy"]]
table(belgium_happy[["y"]])
##
## 0 1 2 3 4 5 6 7 8 9 10 77 88 99
## 50 27 104 194 234 830 999 3503 6521 3402 1565 3 16 3
# 77, 88 and 99 show up in the table above next to the 0-10 answers. They would
# distort the mean if left in (see the ESS data portal for what each code
# stands for, e.g. Don't know, Refusal), so everything from 77 through 99
# becomes NA.
belgium_happy[["y"]] <- replace(
  belgium_happy[["y"]],
  belgium_happy[["y"]] %in% 77:99,
  NA
)
# Confirm that only 0-10 remain
table(belgium_happy[["y"]])
##
## 0 1 2 3 4 5 6 7 8 9 10
## 50 27 104 194 234 830 999 3503 6521 3402 1565
# na.rm = TRUE drops the values that were just recoded to NA
mean_y <- mean(belgium_happy[["y"]], na.rm = TRUE)
cat("Mean of 'y' is:", mean_y, "\n")
## Mean of 'y' is: 7.737334
# Norwegian respondents only, keeping just the `happy` item
norway_happy <- ess %>%
  filter(cntry == "NO") %>%
  select(happy)
# Work on a copy (`y`) so the raw `happy` column stays available for comparison
norway_happy$y <- norway_happy$happy
table(norway_happy$y)
##
## 0 1 2 3 4 5 6 7 8 9 10 77 88
## 15 29 59 163 238 730 817 2617 5235 3796 2344 12 10
# Recode the non-response codes 77, 88 and 99 to NA. Only 77 and 88 occur for
# Norway in the table above, but 99 is listed as well so that the same codes
# are excluded as in the Belgian calculation and the two means are comparable.
norway_happy$y[norway_happy$y %in% c(77, 88, 99)] <- NA
# Confirm that only 0-10 remain
table(norway_happy$y)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 15 29 59 163 238 730 817 2617 5235 3796 2344
# NOTE: this overwrites the `mean_y` value computed for Belgium
mean_y <- mean(norway_happy$y, na.rm = TRUE)
cat("Mean of 'y' is:", mean_y, "\n")
## Mean of 'y' is: 7.975005
Based on the ESS data, the mean of the levels of happiness of Norway is 7.975005. On the other hand, the mean of Belgium is 7.737334. Since 7.975005 > 7.737334, Norway reports higher levels of happiness.
Provide code and answer.
Prompt and question: what is the most common category selected, for Irish respondents, for frequency of binge drinking? The variable of interest is: alcbnge.
More info here: https://ess-search.nsd.no/en/variable/0c65116e-7481-4ca6-b1d9-f237db99a694.
Hint: need to convert numeric value entries to categories as specified in the variable information link. We did similar steps for Estonia and the climate change attitude variable.
# Irish respondents only, keeping just the binge-drinking frequency item
ireland_bngdrnk <- ess %>%
  filter(cntry == "IE") %>%
  select(alcbnge)
ireland_bngdrnk[["y"]] <- ireland_bngdrnk[["alcbnge"]]
table(ireland_bngdrnk[["y"]])
##
## 1 2 3 4 5 6 7 8
## 65 650 346 417 239 641 26 6
# Codes 6 through 8 are not among the five frequency categories labelled
# below, so they are set to NA.
ireland_bngdrnk[["y"]] <- replace(
  ireland_bngdrnk[["y"]],
  ireland_bngdrnk[["y"]] %in% 6:8,
  NA
)
# Replace the numeric codes with their text labels so the mode can be reported
# as a category name rather than a number.
df <- ireland_bngdrnk %>%
  mutate(
    y_category = case_when(
      y == 1 ~ "Daily or almost daily",
      y == 2 ~ "Weekly",
      y == 3 ~ "Monthly",
      y == 4 ~ "Less than monthly",
      y == 5 ~ "Never",
      .default = NA_character_
    )
  ) %>%
  mutate(
    # List the levels in the order they should be displayed; without this step
    # factor() would sort them alphabetically.
    y_category = fct_relevel(
      factor(y_category),
      "Daily or almost daily",
      "Weekly",
      "Monthly",
      "Less than monthly",
      "Never"
    )
  )
# Check the conversion: counts should match codes 1-5 in the table above
table(df[["y_category"]])
##
## Daily or almost daily Weekly Monthly
## 65 650 346
## Less than monthly Never
## 417 239
# Let's determine the mode of our newly created category:
# Return the most frequent value(s) of `v` as a character vector.
#
# v: an atomic vector or factor. NA entries are ignored because table() drops
#    them by default.
# Returns every value tied for the highest count, in table() order. Returns
# character(0) when there is nothing to count (empty input, all NA, or a
# factor with no observations) instead of warning or reporting every unused
# level as a mode.
get_mode <- function(v) {
  tbl <- table(v)
  if (sum(tbl) == 0L) {
    return(character(0))
  }
  names(tbl)[tbl == max(tbl)]
}
# get_mode() can return several tied categories; toString() joins them with ", "
mode_values <- get_mode(df[["y_category"]])
cat("Mode of y category:", toString(mode_values), "\n")
## Mode of y category: Weekly
For Irish respondents, the most common category selected for frequency of binge drinking is weekly.
Provide code and answer.
Prompt and question: when you use the summary() function for the variable plnftr (about planning for the future versus taking each day as it comes, on a 0-10 scale) for both Portugal and Serbia, what do you notice? What stands out as different when you compare the two countries (note: look up the variable information on the ESS website to help with interpretation)? Explain while referring to the output generated.
# Portuguese respondents only, keeping just `plnftr` (0-10 scale)
portugal_plnftr <- ess %>%
  filter(cntry == "PT") %>%
  select(plnftr)
# Work on a copy (`y`) so summary() below can show raw and cleaned side by side
portugal_plnftr$y <- portugal_plnftr$plnftr
table(portugal_plnftr$y)
##
## 0 1 2 3 4 5 6 7 8 9 10 88
## 114 184 313 356 264 481 262 382 345 166 370 40
# Recode the non-response codes 77, 88 and 99 to NA. Only 88 occurs for
# Portugal in the table above; 77 and 99 are listed too so the same codes are
# excluded for every country that is compared on this variable.
portugal_plnftr$y[portugal_plnftr$y %in% c(77, 88, 99)] <- NA
# checking again
table(portugal_plnftr$y)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 114 184 313 356 264 481 262 382 345 166 370
# Using tidyverse syntax principles, we can get both the mean and median at once.
portugal_plnftr %>%
  summarize(
    mean_y = mean(y, na.rm = TRUE),
    median_y = median(y, na.rm = TRUE)
  ) %>%
  print()
## mean_y median_y
## 1 5.418289 5
# Mode: count each answer, sort by frequency, and keep the top row's value.
# Ties resolve to the smallest answer because count() orders rows by `y`.
mode_y <- portugal_plnftr %>%
  filter(!is.na(y)) %>%
  count(y) %>%
  arrange(desc(n)) %>%
  slice(1) %>%
  pull(y)
cat("\nMode of Y:", mode_y, "\n")
##
## Mode of Y: 5
sd_y <- sd(portugal_plnftr$y, na.rm = TRUE)
cat("Standard Deviation of 'y':", sd_y, "\n")
## Standard Deviation of 'y': 2.86348
# Raw `plnftr` next to cleaned `y`: the recoded values no longer count toward
# the mean or the maximum.
summary(portugal_plnftr)
## plnftr y
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 5.000 Median : 5.000
## Mean : 6.426 Mean : 5.418
## 3rd Qu.: 8.000 3rd Qu.: 8.000
## Max. :88.000 Max. :10.000
## NA's :14604 NA's :14644
# Serbian respondents only, keeping just `plnftr` (0-10 scale)
serbia_plnftr <- ess %>%
  filter(cntry == "RS") %>%
  select(plnftr)
# Work on a copy (`y`) so summary() below can show raw and cleaned side by side
serbia_plnftr$y <- serbia_plnftr$plnftr
table(serbia_plnftr$y)
##
## 0 1 2 3 4 5 6 7 8 9 10 77 88
## 587 133 152 138 95 246 70 87 103 47 364 4 17
# Recode the non-response codes 77, 88 and 99 to NA. Only 77 and 88 occur for
# Serbia in the table above; 99 is listed too so the same codes are excluded
# for every country that is compared on this variable.
serbia_plnftr$y[serbia_plnftr$y %in% c(77, 88, 99)] <- NA
# checking again
table(serbia_plnftr$y)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 587 133 152 138 95 246 70 87 103 47 364
# Using tidyverse syntax principles, we can get both the mean and median at once.
serbia_plnftr %>%
  summarize(
    mean_y = mean(y, na.rm = TRUE),
    median_y = median(y, na.rm = TRUE)
  ) %>%
  print()
## mean_y median_y
## 1 4.142928 4
# Mode: count each answer, sort by frequency, and keep the top row's value.
# Ties resolve to the smallest answer because count() orders rows by `y`.
# NOTE: `mode_y` and `sd_y` overwrite the values computed for Portugal.
mode_y <- serbia_plnftr %>%
  filter(!is.na(y)) %>%
  count(y) %>%
  arrange(desc(n)) %>%
  slice(1) %>%
  pull(y)
cat("\nMode of Y:", mode_y, "\n")
##
## Mode of Y: 0
sd_y <- sd(serbia_plnftr$y, na.rm = TRUE)
cat("Standard Deviation of 'y':", sd_y, "\n")
## Standard Deviation of 'y': 3.757209
# Raw `plnftr` next to cleaned `y`: the recoded values no longer count toward
# the mean or the maximum.
summary(serbia_plnftr)
## plnftr y
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 4.000 Median : 4.000
## Mean : 4.983 Mean : 4.143
## 3rd Qu.: 8.000 3rd Qu.: 8.000
## Max. :88.000 Max. :10.000
## NA's :1505 NA's :1526
Compared with Portugal, more people in Serbia plan for their future as much as possible. In Serbia's summary output, the first quartile (0 vs. 3), the median (4 vs. 5), and the mean (4.143 vs. 5.418) are all lower than in Portugal's. On this scale, lower numbers mean that people plan for their future as much as possible, while higher numbers mean that people just take each day as it comes. The output therefore suggests that people in Serbia lean more toward planning for their future, while people in Portugal lean more toward taking each day as it comes. At the same time, Serbia and Portugal have the same minimum (0) and maximum (10) once the missing-value codes are removed.
Provide code and answer.
Prompt and question: using the variables stfdem and gndr, answer the following: on average, who is more dissatisfied with democracy in Italy, men or women? Explain while referring to the output generated.
Info on variable here: https://ess.sikt.no/en/variable/query/stfdem/page/1
# Italian respondents, with gender labelled and stfdem (satisfaction with
# democracy) cleaned in a single pass.
italy_data <- ess %>%
  filter(cntry == "IT") %>%
  mutate(
    # 1/2 become text labels; any other code is kept as text so it is still
    # visible in the table below.
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      .default = as.character(gndr)
    ),
    # 77, 88 and 99 are set to NA so they do not enter the means
    stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem)
  )
table(italy_data[["gndr"]])
##
## 9 Female Male
## 13 5329 4836
# Code 9 is neither Male nor Female: blank it out, then drop those rows.
# `gndr` is character at this point, hence the quoted "9".
italy_data <- italy_data %>%
  mutate(gndr = replace(gndr, gndr %in% "9", NA)) %>%
  filter(!is.na(gndr))
table(italy_data[["gndr"]])
##
## Female Male
## 5329 4836
# Mean satisfaction among men only
mean_male_stfdem <- italy_data %>%
  filter(gndr == "Male") %>%
  summarise(mean_stfdem_men = mean(stfdem, na.rm = TRUE))
print(mean_male_stfdem)
## mean_stfdem_men
## 1 4.782646
# Mean satisfaction for each gender: group by the second variable, then
# summarise the variable of interest within each group.
means_by_gender <- italy_data %>%
  group_by(gndr) %>%
  summarise(stfdem = mean(stfdem, na.rm = TRUE))
print(means_by_gender)
## # A tibble: 2 × 2
## gndr stfdem
## <chr> <dbl>
## 1 Female 4.66
## 2 Male 4.78
According to the output, the average for women is 4.657553 and the average for men is 4.782646. Since 4.657553 < 4.782646, women are, on average, more dissatisfied with democracy in Italy, because lower numbers are closer to “extremely dissatisfied” and higher numbers are closer to “extremely satisfied”.
Provide code and answer.
Prompt: Interpret the boxplot graph of stfedu and stfhlth that we generated already: according to ESS data, would we say that the median French person is more satisfied with the education system or health services? Explain.
Change the boxplot graph: provide the code to change some of the key labels: (1) Change the title to: Boxplot of satisfaction with the state of education vs. health services; (2) Remove the x-axis label; (3) Change the y-axis label to: Satisfaction (0-10).
Hint: copy the boxplot code above and just replace or cut what is asked.
# French respondents only
france_data <- ess %>%
  filter(cntry == "FR")

# Side-by-side boxplots of satisfaction with education (stfedu) and with
# health services (stfhlth).
france_data %>%
  # 77, 88 and 99 are set to NA so they are not plotted as scores
  mutate(
    stfedu = ifelse(stfedu %in% c(77, 88, 99), NA, stfedu),
    stfhlth = ifelse(stfhlth %in% c(77, 88, 99), NA, stfhlth)
  ) %>%
  # Reshape to long format: one row per respondent-item pair, with the item
  # name in `variable` and the score in `value`
  select(stfedu, stfhlth) %>%
  pivot_longer(
    cols = c(stfedu, stfhlth),
    names_to = "variable",
    values_to = "value"
  ) %>%
  # Creating the boxplot
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot() +
  labs(
    x = NULL, # removes the x-axis label, as the task requires
    y = "Satisfaction (0-10)",
    title = "Boxplot of satisfaction with the state of education vs. health services"
  ) +
  theme_minimal()
## Warning: Removed 364 rows containing non-finite values (`stat_boxplot()`).