Zhang_Leanna

# List of packages
packages <- c("tidyverse", "fst", "modelsummary") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"

##Task 1 Provide code and answer.

Prompt and question: calculate the average for the variable ‘health’ for the countries of Serbia and Belgium. On average, based on the ESS data, who reports higher levels of subjective health: Serbia or Belgium?

#serbia_data <- read.fst("serbia_data.fst")

#belgium_data <- read.fst("belgium_data.fst")

##Health Data for Serbia

serbia_data <- read.fst("serbia_data.fst")

# Keep only the subjective-health item
serbia_health <- serbia_data %>%
  select(health)

# Recode invalid values to NA.
# 'health' is a single-digit ESS item (valid answers 1-5), so its missing-value
# codes are 7 (refusal), 8 (don't know) and 9 (no answer). The two-digit codes
# 77/88/99 never occur in this variable, so recoding them left the invalid
# answers in the mean.
serbia_health$health[serbia_health$health %in% c(7, 8, 9)] <- NA

##Calculate Mean for Serbia

# Calculate mean, skipping the answers recoded to NA above
mean_serbia <- mean(serbia_health$health, na.rm = TRUE)
cat("Mean of 'health' for Serbia is:", mean_serbia, "\n")

## Mean of 'y' is: 2.575254

##Health Data for Belgium

belgium_data <- read.fst("belgium_data.fst")

# Keep only the subjective-health item
belgium_health <- belgium_data %>%
  select(health)

# Recode invalid values to NA.
# 'health' is a single-digit ESS item (valid answers 1-5), so its missing-value
# codes are 7 (refusal), 8 (don't know) and 9 (no answer). The two-digit codes
# 77/88/99 never occur in this variable, so recoding them left the invalid
# answers in the mean.
belgium_health$health[belgium_health$health %in% c(7, 8, 9)] <- NA

##Calculate Mean for Belgium

# Calculate mean, skipping the answers recoded to NA above
mean_belgium <- mean(belgium_health$health, na.rm = TRUE)
cat("Mean of 'health' for Belgium is:", mean_belgium, "\n")

## Mean of 'y' is: 2.063377

##Compare Belgium = 2.06, Serbia = 2.58. Because 'health' is coded from 1 (very good) to 5 (very bad), a lower mean means better health; therefore Belgium reports higher levels of subjective health

mean_belgium

## [1] 2.063377

mean_serbia

## [1] 2.575254

##Task 2 Provide code and answer.

Prompt and question: what is the most common category selected, for Norwegian respondents, for how often they socially meet with friends or relatives. The variable of interest is: sclmeet.

norway_data <- read.fst("norway_data.fst")

# Keep Norwegian respondents only, then the social-meeting item.
# (An earlier unfiltered assignment to norway_social was overwritten
# immediately, so it has been dropped.)
norway_social <- norway_data %>%
  filter(cntry == "NO") %>%
  select(sclmeet)

table(norway_social$sclmeet)

## 
##    1    2    3    4    5    6    7   77   88 
##   44  356  746 2569 2119 5764 4458    2    7

1 Never

2 Less than once a month

3 Once a month

4 Several times a month

5 Once a week

6 Several times a week

7 Every day

77 Refusal*

88 Don’t know*

99 No answer*

# Set the missing-value codes to NA. List them explicitly: 77:88 is the whole
# range 77, 78, ..., 88 and would silently miss 99.
norway_social$sclmeet[norway_social$sclmeet %in% c(77, 88, 99)] <- NA

table(norway_social$sclmeet)

## 
##    1    2    3    4    5    6    7 
##   44  356  746 2569 2119 5764 4458

# Attach the ESS answer labels for sclmeet (1 = Never ... 7 = Every day).
# Giving `levels` explicitly ties each label to its code even if a code is
# absent from the data; any value outside 1-7 becomes NA.
norway_social$sclmeet_category <- factor(
  norway_social$sclmeet,
  levels = 1:7,
  labels = c(
    "Never",
    "Less than once a month",
    "Once a month",
    "Several times a month",
    "Once a week",
    "Several times a week",
    "Every day"
  )
)

table(norway_social$sclmeet_category)

## 
## Daily or almost daily                Weekly               Monthly 
##                    44                   356                   746 
##     Less than monthly                 Never        Not Applicable 
##                  2569                  2119                  5764 
##            Don't Know 
##                  4458

# Return the most frequent value(s) of a vector as character strings.
#
# v: a vector or factor. NA values are not counted (table() drops them by
#    default).
# Returns a character vector holding the name of every category that reaches
# the highest count (several elements when there is a tie), or character(0)
# when the frequency table is empty.
get_mode <- function(v) {
  counts <- table(v)
  # Guard: max() on an empty table would warn and return -Inf
  if (length(counts) == 0) {
    return(character(0))
  }
  names(counts)[counts == max(counts)]
}

mode_norway_social <- get_mode(norway_social$sclmeet_category)
mode_norway_social

## [1] "Not Applicable"

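The most common category is 6, "Several times a week" (5,764 respondents): Norwegians most often report meeting socially with friends or relatives several times a week.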

##Task 3 Provide code and answer.

Prompt and question: when you use the summary() function for the variable wrclmch (i.e., how worried about climate change) for both the countries of Estonia and Portugal – what do you notice? What stands out as different when you compare the two countries (note: look up the variable information on the ESS website to help with interpretation)? Explain while referring to the output generated.

portugal_data <- read.fst("portugal_data.fst")
estonia_data <- read.fst("estonia_data.fst")

# Climate-change worry item, Portuguese respondents only
portugal_wrclmch <- select(filter(portugal_data, cntry == "PT"), wrclmch)

# Climate-change worry item, Estonian respondents only
estonia_wrclmch <- select(filter(estonia_data, cntry == "EE"), wrclmch)

summary(portugal_wrclmch)

##     wrclmch     
##  Min.   :1.000  
##  1st Qu.:3.000  
##  Median :4.000  
##  Mean   :3.583  
##  3rd Qu.:4.000  
##  Max.   :8.000  
##  NA's   :14773

summary(estonia_wrclmch)

##     wrclmch     
##  Min.   :1.000  
##  1st Qu.:2.000  
##  Median :3.000  
##  Mean   :2.855  
##  3rd Qu.:3.000  
##  Max.   :8.000  
##  NA's   :13295

table(portugal_wrclmch$wrclmch)

## 
##    1    2    3    4    5    6    7    8 
##   55  251 1171 1203  380   13    4   31

table(estonia_wrclmch$wrclmch)

## 
##    1    2    3    4    5    6    7    8 
##  343  852 1612  565  125   56    1    7

summary(portugal_wrclmch)

##     wrclmch     
##  Min.   :1.000  
##  1st Qu.:3.000  
##  Median :4.000  
##  Mean   :3.583  
##  3rd Qu.:4.000  
##  Max.   :8.000  
##  NA's   :14773

summary(estonia_wrclmch)

##     wrclmch     
##  Min.   :1.000  
##  1st Qu.:2.000  
##  Median :3.000  
##  Mean   :2.855  
##  3rd Qu.:3.000  
##  Max.   :8.000  
##  NA's   :13295

What stands out is the first quartile and mean.

1st Q for Portugal is 3, for Estonia is 2. That means the middle 50% for Estonia actually starts at 2. The mean for Portugal is 3.58 while the mean for Estonia is 2.86. The median for Portugal is 4 while it is only 3 for Estonia.

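They also both have the same maximum and minimum values (1 and 8). Note that the valid answers only run from 1 (not at all worried) to 5 (extremely worried); 8 is the "Don't know" code, so the codes above 5 should be set to NA before the means are interpreted. Both countries also have a very large number of NAs because the question was not asked in every ESS round.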

##Task 4 Provide code and answer.

Prompt and question: using the variables stfhlth and gndr, answer the following: on average, who is more satisfied with health services in Italy, men or women? Explain while referring to the output generated.

italy_data <- read.fst("italy_data.fst")

table(italy_data$gndr)

## 
##    1    2    9 
## 4836 5329   13

# Code 9 is the only value other than 1 and 2 in the table above; set it to NA
italy_data$gndr[italy_data$gndr %in% 9] <- NA

table(italy_data$stfhlth)

## 
##    0    1    2    3    4    5    6    7    8    9   10   77   88   99 
##  455  279  553  778  958 1470 1718 1869 1363  411  192   24  105    3

# Set the missing-value codes to NA. List them explicitly rather than using
# the range 77:99, which spans every integer from 77 to 99.
italy_data$stfhlth[italy_data$stfhlth %in% c(77, 88, 99)] <- NA

# Turn the gender codes into text labels, then keep only the rows where both
# gndr and stfhlth are non-missing
italy_data <- italy_data %>%
  mutate(
    gndr = case_when(
      gndr == 2 ~ "Female",
      gndr == 1 ~ "Male",
      TRUE ~ as.character(gndr)
    )
  ) %>%
  filter(!is.na(gndr), !is.na(stfhlth))

table(italy_data$gndr)

## 
## Female   Male 
##   5260   4774

table(italy_data$stfhlth)

## 
##    0    1    2    3    4    5    6    7    8    9   10 
##  455  278  552  775  957 1466 1716 1869 1363  411  192

# Mean satisfaction with health services, one row per gender
summarize(
  group_by(italy_data, gndr),
  meanstfhlth = mean(stfhlth, na.rm = TRUE)
)

## # A tibble: 2 × 2
##   gndr   meanstfhlth
##   <chr>        <dbl>
## 1 Female        5.38
## 2 Male          5.54

Men are more satisfied with health services in Italy: on the 0–10 scale, the mean for men is 5.54 compared with 5.38 for women.

##Task 5 Create boxplots, in the same graph, for stfdem and stfgov for the country of France. Provide an appropriate title and labels. Further, tell us: according to ESS data, would we say that the median French person is more satisfied with the democracy or the national government? Explain.

france_data <- read.fst("france_data.fst")

france_data %>%
  # Keep only the two satisfaction items
  select(stfdem, stfgov) %>%
  # Setting the 77/88/99 missing-value codes to NA
  mutate(stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem),
         stfgov = ifelse(stfgov %in% c(77, 88, 99), NA, stfgov)) %>%
  # Reshaping to long format: one row per (item, answer) pair.
  # pivot_longer() is the current replacement for the superseded gather().
  pivot_longer(
    cols = c(stfdem, stfgov),
    names_to = "variable",
    values_to = "value"
  ) %>%
  # Creating the boxplot, one box per item
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot() +
  # Readable tick labels instead of the raw variable names
  scale_x_discrete(
    labels = c(stfdem = "Democracy", stfgov = "National government")
  ) +
  labs(
    y = "Satisfaction (0 = extremely dissatisfied, 10 = extremely satisfied)",
    x = "Satisfaction with...",
    title = "Satisfaction with democracy vs. national government in France"
  ) +
  theme_minimal()

## Warning: Removed 557 rows containing non-finite values (`stat_boxplot()`).

# Median of valid answers only: france_data still holds the raw 77/88/99
# missing-value codes (the plot pipeline did not modify it), so drop them here
median(
  france_data$stfdem[!france_data$stfdem %in% c(77, 88, 99)],
  na.rm = TRUE
)

## [1] 5

# Median of valid answers only: france_data still holds the raw 77/88/99
# missing-value codes (the plot pipeline did not modify it), so drop them here
median(
  france_data$stfgov[!france_data$stfgov %in% c(77, 88, 99)],
  na.rm = TRUE
)

## [1] 4

The median French person is more satisfied with the democracy as the median for national government is only 4 but the median for democracy is 5.

Zhang_Leanna_Makeup

2024-02-18