Installing & Applying Packages
# Packages this analysis depends on
packages <- c("tidyverse", "fst", "modelsummary", "ggplot2", "dplyr")

# Install whichever of them are not yet present in the local library
new_packages <- setdiff(packages, installed.packages()[, "Package"])
if (length(new_packages)) {
  install.packages(new_packages)
}

# Attach every package by name (the list returned by lapply is printed)
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` has built-in support to draw text-only (markdown) tables.
## To generate tables in other formats, you must install one or more of
## these libraries:
##
## install.packages(c(
## "kableExtra",
## "gt",
## "flextable",
##
## "huxtable",
## "DT"
## ))
##
## Alternatively, you can set markdown as the default table format to
## silence this alert:
##
## config_modelsummary(factory_default = "markdown")
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
# Print the current working directory
getwd()
## [1] "/Users/owner/Downloads"
# Point the session at the data folder, then empty the global environment
# and run garbage collection (gc() prints its memory summary).
setwd("/Users/owner/Downloads")
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 1080555 57.8 2198598 117.5 NA 1484656 79.3
## Vcells 1861986 14.3 8388608 64.0 16384 2379122 18.2
# Read "All-ESS-Data.fst" (path is relative to the working directory) into `ess`
ess <- read_fst("All-ESS-Data.fst")
Task 1: Calculate the average for the variable ‘happy’ for the
country of Norway. On average, based on the ESS data, who reports higher
levels of happiness: Norway or Belgium? Note: we already did it for
Belgium. You just need to compare to Norway’s average, making sure to
provide the code for both.
# Belgium: keep only `happy`, copy it to a working column `y`,
# and tabulate the raw codes
belgium_happy <- ess %>%
  filter(cntry == "BE") %>%
  select(happy) %>%
  mutate(y = happy)
table(belgium_happy[["y"]])
##
## 0 1 2 3 4 5 6 7 8 9 10 77 88 99
## 50 27 104 194 234 830 999 3503 6521 3402 1565 3 16 3
# Blank out every code from 77 through 99, then re-tabulate
belgium_happy$y <- replace(
  belgium_happy$y,
  belgium_happy$y %in% 77:99,
  NA
)
table(belgium_happy[["y"]])
##
## 0 1 2 3 4 5 6 7 8 9 10
## 50 27 104 194 234 830 999 3503 6521 3402 1565
# Belgium: mean of the cleaned responses, skipping NA
mean_y <- mean(belgium_happy[["y"]], na.rm = TRUE)
cat(
  "Mean of 'y' is:",
  mean_y,
  "\n"
)
## Mean of 'y' is: 7.737334
# Norway: keep only `happy`, copy it to a working column `y`,
# and tabulate the raw codes
norway_happy <- ess %>%
  filter(cntry == "NO") %>%
  select(happy) %>%
  mutate(y = happy)
table(norway_happy[["y"]])
##
## 0 1 2 3 4 5 6 7 8 9 10 77 88
## 15 29 59 163 238 730 817 2617 5235 3796 2344 12 10
# Blank out the non-response codes 77, 88 and 99. The previous range 77:88
# would have left a 99 in place, unlike the Belgium recode of this variable.
norway_happy$y[norway_happy$y %in% c(77, 88, 99)] <- NA
table(norway_happy$y)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 15 29 59 163 238 730 817 2617 5235 3796 2344
# Norway: mean of the cleaned responses, skipping NA
mean_y <- mean(norway_happy[["y"]], na.rm = TRUE)
cat(
  "Mean of 'y' is:",
  mean_y,
  "\n"
)
## Mean of 'y' is: 7.975005
Therefore, on average, Norway has higher levels of happiness than
Belgium.
Task 2: What is the most common category selected, for Irish
respondents, for frequency of binge drinking? The variable of interest
is: alcbnge.
# Viewing & cleaning up Ireland alcbnge data
# Fix: Ireland's country code is "IE"; the previous filter used "IL", which
# is Israel. NOTE(review): any output or conclusion produced with the old
# filter describes Israeli respondents and must be regenerated.
ireland_alcbnge <- ess %>%
  filter(cntry == "IE") %>%
  select(alcbnge)
ireland_alcbnge$y <- ireland_alcbnge$alcbnge
table(ireland_alcbnge$y)
##
## 1 2 3 4 5 7 8 9
## 15 153 149 264 362 5 69 1545
# Blank out codes 7 through 9, then re-tabulate
ireland_alcbnge$y <- replace(
  ireland_alcbnge$y,
  ireland_alcbnge$y %in% 7:9,
  NA
)
table(ireland_alcbnge[["y"]])
##
## 1 2 3 4 5
## 15 153 149 264 362
# Map the numeric codes 1-5 to text labels (anything else becomes NA),
# then turn the labels into a factor ordered from most to least frequent
# drinking.
df <- ireland_alcbnge %>%
  mutate(
    y_category = case_when(
      y == 1 ~ "Daily or almost daily",
      y == 2 ~ "Weekly",
      y == 3 ~ "Monthly",
      y == 4 ~ "Less than monthly",
      y == 5 ~ "Never",
      TRUE ~ NA_character_
    )
  ) %>%
  mutate(
    y_category = fct_relevel(
      factor(y_category),
      "Daily or almost daily",
      "Weekly",
      "Monthly",
      "Less than monthly",
      "Never"
    )
  )
table(df[["y_category"]])
##
## Daily or almost daily Weekly Monthly
## 15 153 149
## Less than monthly Never
## 264 362
# Calculating mode

# Return the most frequent value(s) of `v` as a character vector.
#
# v: an atomic vector or factor; NA entries are ignored by table().
# Returns every value tied for the highest count, in table() order, or
# character(0) when `v` has no non-missing values (previously this case
# called max() on an empty table and raised a warning).
get_mode <- function(v) {
  counts <- table(v)
  if (length(counts) == 0L) {
    return(character(0))
  }
  names(counts)[as.vector(counts == max(counts))]
}
# Find the modal category (or categories) and print them comma-separated
mode_values <- get_mode(df[["y_category"]])
cat("Mode of y category:", toString(mode_values), "\n")
## Mode of y category: Never
Therefore, the most common category selected for the frequency of
binge drinking by Irish respondents was “Never”.
Task 3: When you use the summary() function for the variable plnftr
(about planning for the future versus taking each day as it comes, from
0-10) for both the countries of Portugal and Serbia, what do you notice?
What stands out as different when you compare the two countries (note:
look up the variable information on the ESS website to help with
interpretation)? Explain while referring to the output generated.
# Portugal: keep only `plnftr`, copy it to a working column `y`,
# and tabulate the raw codes
portugal_plnftr <- ess %>%
  filter(cntry == "PT") %>%
  select(plnftr) %>%
  mutate(y = plnftr)
table(portugal_plnftr[["y"]])
##
## 0 1 2 3 4 5 6 7 8 9 10 88
## 114 184 313 356 264 481 262 382 345 166 370 40
# Blank out the non-response codes 77, 88 and 99. Only 88 was handled
# before, which relied on the other codes being absent from this subset.
portugal_plnftr$y[portugal_plnftr$y %in% c(77, 88, 99)] <- NA
table(portugal_plnftr$y)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 114 184 313 356 264 481 262 382 345 166 370
# Portugal: five-number summary, mean and NA count of the cleaned responses
summary(portugal_plnftr[["y"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 3.000 5.000 5.418 8.000 10.000 14644
# Serbia: keep only `plnftr`, copy it to a working column `y`,
# and tabulate the raw codes
serbia_plnftr <- ess %>%
  filter(cntry == "RS") %>%
  select(plnftr) %>%
  mutate(y = plnftr)
table(serbia_plnftr[["y"]])
##
## 0 1 2 3 4 5 6 7 8 9 10 77 88
## 587 133 152 138 95 246 70 87 103 47 364 4 17
# Blank out the non-response codes 77, 88 and 99. The previous range 77:88
# would have left a 99 in place.
serbia_plnftr$y[serbia_plnftr$y %in% c(77, 88, 99)] <- NA
table(serbia_plnftr$y)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 587 133 152 138 95 246 70 87 103 47 364
# Serbia: five-number summary, mean and NA count of the cleaned responses
summary(serbia_plnftr[["y"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 4.000 4.143 8.000 10.000 1526
For the variable plnftr, a response of “0” means the respondent
plans for their future as much as possible, and a response of 10 means
the respondent takes each day as it comes. From the summary table,
Portugal has a higher mean than Serbia, 5.418 compared to 4.143,
indicating that on average, Portugal plans less about their future than
Serbia.
Task 4: Using the variables stfdem and gndr, answer the following:
on average, who is more dissatisfied with democracy in Italy, men or
women? Explain while referring to the output generated.
# Viewing Italy data
italy_data <- ess %>%
  filter(cntry == "IT")

# Converting to categories & cleaning up data
# gndr: 1 -> "Male", 2 -> "Female", 9 -> NA; any other code is kept as text.
# stfdem: only the non-response codes 77/88/99 are blanked.
# Fix: the previous recode also set 6-10 to NA, but those are valid answers
# on the 0-10 satisfaction scale, so the means were computed on a truncated
# scale. NOTE(review): group means and conclusions produced with the old
# recode must be regenerated.
italy_data <- italy_data %>%
  mutate(
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ NA_character_,
      TRUE ~ as.character(gndr)
    ),
    stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem)
  )
# Average stfdem within each gender group, skipping NA responses
means_by_gender <- italy_data %>%
  group_by(gndr) %>%
  summarise(
    stfdem = mean(stfdem, na.rm = TRUE)
  )
print(means_by_gender)
## # A tibble: 3 × 2
## gndr stfdem
## <chr> <dbl>
## 1 Female 3.22
## 2 Male 3.19
## 3 <NA> 2.22
On average, males are more dissatisfied with democracy in Italy than
females, as they have a lower mean for the variable stfdem, where a score
of 0 indicates “Extremely dissatisfied” and a score of 10 indicates
“Extremely satisfied”.
Task 5B: Change the boxplot graph: provide the code to change some
of the key labels: (1) Change the title to: Boxplot of satisfaction with
the state of education vs. health services; (2) Remove the x-axis label;
(3) Change the y-axis label to: Satisfaction (0-10).
# Changing title and axis labels
# Blank out the non-response codes, reshape the two satisfaction items to
# long form (one row per response), and draw one box per item.
# pivot_longer() replaces the superseded gather(); x = NULL removes the
# x-axis label outright instead of drawing an empty one.
france_data %>%
  mutate(
    stfedu = ifelse(stfedu %in% c(77, 88, 99), NA, stfedu),
    stfhlth = ifelse(stfhlth %in% c(77, 88, 99), NA, stfhlth)
  ) %>%
  select(stfedu, stfhlth) %>%
  pivot_longer(
    cols = c(stfedu, stfhlth),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot() +
  labs(
    title = "Boxplot of satisfaction with the state of education vs. health services",
    x = NULL,
    y = "Satisfaction (0-10)"
  ) +
  theme_minimal()
## Warning: Removed 364 rows containing non-finite values (`stat_boxplot()`).
