Homework 1 - SOC222 - HARIS HABIB

# List of packages
packages <- c("tidyverse", "fst", "modelsummary") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"
# Load the combined ESS dataset from the fst file in the working directory
ess <- read_fst(path = "All-ESS-Data.fst")

Task 1

BELGIUM

# Keep only Belgian respondents' `happy` answers and copy them into a
# working column `y`, so the raw variable stays untouched when recoding.
belgium_happy <- ess %>%
  filter(cntry == "BE") %>%
  select(happy) %>%
  mutate(y = happy)

# Frequency of every response code, including any non-scale codes
table(belgium_happy[["y"]])
## 
##    0    1    2    3    4    5    6    7    8    9   10   77   88   99 
##   50   27  104  194  234  830  999 3503 6521 3402 1565    3   16    3
# Mark every code from 77 to 99 as missing so only the 0-10 scale remains
is.na(belgium_happy$y) <- belgium_happy$y %in% 77:99

# Re-tabulate to confirm the non-scale codes are gone
table(belgium_happy$y)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   50   27  104  194  234  830  999 3503 6521 3402 1565

MEAN OF BELGIUM

# Average happiness in Belgium, skipping the values recoded to NA
mean_y <- belgium_happy %>%
  pull(y) %>%
  mean(na.rm = TRUE)
cat("Mean of 'y' is:", mean_y, "\n", sep = " ")
## Mean of 'y' is: 7.737334

NORWAY

# Keep only Norwegian respondents' `happy` answers and copy them into a
# working column `y`, so the raw variable stays untouched when recoding.
norway_happy <- ess %>%
  filter(cntry == "NO") %>%
  select(happy) %>%
  mutate(y = happy)

# Frequency of every response code, including any non-scale codes
table(norway_happy[["y"]])
## 
##    0    1    2    3    4    5    6    7    8    9   10   77   88 
##   15   29   59  163  238  730  817 2617 5235 3796 2344   12   10
# Recode the non-scale codes to NA. The codes are listed explicitly
# (77, 88, 99) instead of the range 77:88: the range silently skips 99,
# which the Belgium recode and the Task 5 recode both treat as missing.
# NOTE(review): code meanings (refusal / don't know / no answer) should be
# confirmed against the ESS codebook.
norway_happy$y[norway_happy$y %in% c(77, 88, 99)] <- NA

# checking again
table(norway_happy$y)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   15   29   59  163  238  730  817 2617 5235 3796 2344

MEAN OF NORWAY

# Average happiness in Norway, skipping the values recoded to NA
mean_y <- norway_happy %>%
  pull(y) %>%
  mean(na.rm = TRUE)
cat("Mean of 'y' is:", mean_y, "\n", sep = " ")
## Mean of 'y' is: 7.975005

Prompt and question: calculate the average for the variable ‘happy’ for the country of Norway. On average, based on the ESS data, who reports higher levels of happiness: Norway or Belgium?

ANSWER: On average, Norway reports a higher level of happiness, with a mean of 7.975005, while Belgium reports a lower level of happiness, with a mean of 7.737334.

Task 2

# Keep only Irish respondents' binge-drinking frequency (`alcbnge`) and
# copy it into a working column `y` for recoding.
ireland_ccdrinking <- ess %>%
  filter(cntry == "IE") %>%
  select(alcbnge) %>%
  mutate(y = alcbnge)

# Frequency of every response code before cleaning
table(ireland_ccdrinking[["y"]])
## 
##   1   2   3   4   5   6   7   8 
##  65 650 346 417 239 641  26   6
# Codes 6, 7 and 8 are not labelled as frequency categories below,
# so treat them as missing
ireland_ccdrinking$y <- replace(
  ireland_ccdrinking$y,
  ireland_ccdrinking$y %in% 6:8,
  NA
)
# Translate the numeric codes 1-5 into readable labels, then turn the labels
# into a factor whose levels run from most to least frequent drinking
# (anything that is not 1-5 becomes NA).
df <- ireland_ccdrinking %>%
  mutate(
    y_category = case_when(
      y == 1 ~ "Daily or almost daily",
      y == 2 ~ "Weekly",
      y == 3 ~ "Monthly",
      y == 4 ~ "Less than monthly",
      y == 5 ~ "Never",
      .default = NA_character_
    )
  ) %>%
  mutate(
    y_category = y_category %>%
      factor() %>%
      fct_relevel(
        c(
          "Daily or almost daily",
          "Weekly",
          "Monthly",
          "Less than monthly",
          "Never"
        )
      )
  )

# Tabulate the labelled factor to check the conversion
table(df[["y_category"]])
## 
## Daily or almost daily                Weekly               Monthly 
##                    65                   650                   346 
##     Less than monthly                 Never 
##                   417                   239
# DETERMINING THE MODE OF OUR NEW CATEGORY:

#' Most frequent value(s) of a vector
#'
#' Tabulates `v` with `table()` (which drops `NA` by default) and returns the
#' label(s) with the highest count. Ties are all returned, in table order.
#'
#' @param v A vector or factor.
#' @return A character vector of the modal value(s); `character(0)` when `v`
#'   has no non-missing values.
get_mode <- function(v) {
  counts <- table(v)
  # Guard: max() on an empty table would warn and return -Inf
  if (length(counts) == 0L) {
    return(character(0))
  }
  as.character(names(counts)[counts == max(counts)])
}

# Report the modal category (several, comma-separated, if there is a tie)
mode_values <- get_mode(df[["y_category"]])
cat("Mode of y category:", toString(mode_values), "\n")
## Mode of y category: Weekly

Prompt and question: what is the most common category selected, for Irish respondents, for frequency of binge drinking? The variable of interest is: alcbnge.

ANSWER: The most common category among Irish respondents for frequency of binge drinking is "Weekly" (650 respondents).

Task 3

PORTUGAL

# Keep only Portuguese respondents' `plnftr` answers and copy them into a
# working column `c` for recoding.
portugal_plnftr <- ess %>%
  filter(cntry == "PT") %>%
  select(plnftr) %>%
  mutate(c = plnftr)

# Frequency of every response code before cleaning
table(portugal_plnftr[["c"]])
## 
##   0   1   2   3   4   5   6   7   8   9  10  88 
## 114 184 313 356 264 481 262 382 345 166 370  40
# Recode the non-scale codes to NA. Only 88 appears in the table for
# Portugal, but 77 and 99 are included as well so the recode matches the
# other tasks and cannot let a stray code leak into the summary statistics.
# NOTE(review): code meanings (refusal / don't know / no answer) should be
# confirmed against the ESS codebook.
portugal_plnftr$c[portugal_plnftr$c %in% c(77, 88, 99)] <- NA

# checking again
table(portugal_plnftr$c)
## 
##   0   1   2   3   4   5   6   7   8   9  10 
## 114 184 313 356 264 481 262 382 345 166 370

SERBIA

# Keep only Serbian respondents' `plnftr` answers and copy them into a
# working column `b` for recoding.
serbia_plnftr <- ess %>%
  filter(cntry == "RS") %>%
  select(plnftr) %>%
  mutate(b = plnftr)

# Frequency of every response code before cleaning
table(serbia_plnftr[["b"]])
## 
##   0   1   2   3   4   5   6   7   8   9  10  77  88 
## 587 133 152 138  95 246  70  87 103  47 364   4  17
# Recode the non-scale codes to NA. The codes are listed explicitly
# (77, 88, 99) instead of the range 77:88: the range silently skips 99,
# which the other recodes in this report treat as missing.
# NOTE(review): code meanings (refusal / don't know / no answer) should be
# confirmed against the ESS codebook.
serbia_plnftr$b[serbia_plnftr$b %in% c(77, 88, 99)] <- NA

# checking again
table(serbia_plnftr$b)
## 
##   0   1   2   3   4   5   6   7   8   9  10 
## 587 133 152 138  95 246  70  87 103  47 364

SUMMARY - PORTUGAL

# Five-number summary, mean and NA count for the raw (`plnftr`) and
# cleaned (`c`) columns side by side
portugal_plnftr %>%
  summary()
##      plnftr             c         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 3.000   1st Qu.: 3.000  
##  Median : 5.000   Median : 5.000  
##  Mean   : 6.426   Mean   : 5.418  
##  3rd Qu.: 8.000   3rd Qu.: 8.000  
##  Max.   :88.000   Max.   :10.000  
##  NA's   :14604    NA's   :14644

SUMMARY - SERBIA

# Five-number summary, mean and NA count for the raw (`plnftr`) and
# cleaned (`b`) columns side by side
serbia_plnftr %>%
  summary()
##      plnftr             b         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 4.000   Median : 4.000  
##  Mean   : 4.983   Mean   : 4.143  
##  3rd Qu.: 8.000   3rd Qu.: 8.000  
##  Max.   :88.000   Max.   :10.000  
##  NA's   :1505     NA's   :1526

Prompt and question: when you use the summary() function for the variable plnftr (about planning for future or taking every each day as it comes from 0-10) for both the countries of Portugal and Serbia, what do you notice? What stands out as different when you compare the two countries (note: look up the variable information on the ESS website to help with interpretation)? Explain while referring to the output generated.

ANSWER: Comparing the summary statistics for the variable plnftr in Portugal and Serbia, we find that the mean for Portugal (5.418) is higher than the mean for Serbia (4.143). Because the scale runs from 0 (planning for the future as much as possible) to 10 (taking each day as it comes), the lower Serbian mean suggests that respondents in Serbia report a higher level of future planning. The medians point the same way (4 for Serbia versus 5 for Portugal). The 1st quartile also stands out: it is 0 for Serbia but 3 for Portugal, meaning that at least a quarter of Serbian respondents chose the most extreme "plan as much as possible" answer. This suggests that individuals from Serbia are more inclined to plan for the future as much as possible.

Task 4

# Build the Italian subset in a single pipeline:
#   * label the gender codes (any other code is kept as its character value);
#   * set the non-scale codes of `stfdem` to NA. 99 is included alongside
#     77 and 88 so this recode matches the one used for stfedu/stfhlth in
#     Task 5; otherwise a stray 99 would be averaged as a real score.
# NOTE(review): the label for gndr == 9 is kept as written; confirm the
# official wording in the ESS codebook.
italy_data <- ess %>%
  filter(cntry == "IT") %>%
  mutate(
    gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      gndr == 9 ~ "Refusal to Answer",
      TRUE ~ as.character(gndr)
    ),
    stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem)
  )
# Average satisfaction with democracy among male respondents only
mean_male_stfdem <- italy_data %>%
  filter(gndr %in% "Male") %>%
  summarise(mean_stfdem_men = mean(stfdem, na.rm = TRUE))

print(mean_male_stfdem)
##   mean_stfdem_men
## 1        4.782646
# Average satisfaction with democracy for each gender label, one row per group
means_by_gender <- italy_data %>%
  group_by(gndr) %>%
  summarise(
    across(stfdem, function(x) mean(x, na.rm = TRUE))
  )

print(means_by_gender)
## # A tibble: 3 × 2
##   gndr              stfdem
##   <chr>              <dbl>
## 1 Female              4.69
## 2 Male                4.78
## 3 Refusal to Answer   3.25

Prompt and question: using the variables stfdem and gndr, answer the following: on average, who is more dissatisfied with democracy in Italy, men or women? Explain while referring to the output generated.

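ANSWER: Based on the output generated, women have an average stfdem of 4.69 while men have an average of 4.78. Since stfdem is measured from 0 (extremely dissatisfied) to 10 (extremely satisfied), the lower average indicates that women are slightly more dissatisfied with democracy in Italy than men.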

Task 5

# French respondents only
france_data <- ess %>%
  filter(cntry == "FR")

# Side-by-side boxplots of satisfaction with education (stfedu) and
# health services (stfhlth).
france_data %>%
  select(stfedu, stfhlth) %>%
  # Set the non-scale codes (77, 88, 99) to NA in both columns at once
  mutate(
    across(
      c(stfedu, stfhlth),
      function(x) ifelse(x %in% c(77, 88, 99), NA, x)
    )
  ) %>%
  # Reshape to long format: one row per (variable, value) pair.
  # pivot_longer() replaces the superseded gather(); NA rows are kept,
  # so ggplot still reports them as removed when drawing.
  pivot_longer(
    cols = c(stfedu, stfhlth),
    names_to = "variable",
    values_to = "value"
  ) %>%
  # Creating the boxplot
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot() +
  labs(
    y = "Satisfaction (0-10)",
    title = "Boxplot of satisfaction with the state of education vs. health services"
  ) +
  theme_minimal()
## Warning: Removed 364 rows containing non-finite values (`stat_boxplot()`).

Prompt: Interpret the boxplot graph of stfedu and stfhlth that we generated already: according to ESS data, would we say that the median French person is more satisfied with the education system or health services? Explain.

ANSWER: The median French person is more satisfied with health services (stfhlth) than with the state of education (stfedu). The median for stfhlth is approximately 7.1, while the median for stfedu is 5.0. Because both variables are measured on a 0-10 scale where higher values mean greater satisfaction, the higher median indicates a higher level of satisfaction with health services. Furthermore, both the 1st quartile and the 3rd quartile are lower for stfedu than for stfhlth.