homework 5

##Set Up

# Packages used throughout the analysis (each listed once).
packages <- c(
  "tidyverse", "modelsummary", "forcats", "RColorBrewer",
  "fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges",
  "questionr"
)

# Attach every package before any of them is used. invisible() keeps
# lapply()'s return value (one search-path listing per package) out of the
# rendered output.
invisible(lapply(packages, library, character.only = TRUE))

# NOTE(review): this working directory is specific to one machine; the script
# will fail elsewhere unless the path is adjusted or replaced by a
# project-relative path.
setwd("C:\\Users\\helen\\OneDrive\\Desktop\\soc202")

# Load the data set; `fst` is already attached by the loop above.
ess <- read_fst("All-ESS-Data.fst")

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "fstcore"   "fst"      
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "fstcore"      "fst"          "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "fstcore"      "fst"          "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "fstcore"      "fst"          "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "fstcore"      "fst"          "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "RColorBrewer" "modelsummary" "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "fstcore"     
## [16] "fst"          "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "fstcore"      "fst"          "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "knitr"        "viridis"      "viridisLite"  "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "fstcore"      "fst"          "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "rmarkdown"    "kableExtra"   "knitr"        "viridis"      "viridisLite" 
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "fstcore"      "fst"          "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "fstcore"      "fst"         
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "fstcore"      "fst"         
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"        
## 
## [[12]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "kableExtra"   "knitr"       
##  [6] "viridis"      "viridisLite"  "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "fstcore"     
## [21] "fst"          "stats"        "graphics"     "grDevices"    "utils"       
## [26] "datasets"     "methods"      "base"

#TASK 1

# Row counts per survey round (essround).
table(ess[["essround"]])

## 
##     1     2     3     4     5     6     7     8     9    10 
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685

# Survey year for each round: position k in `replacements` is the year of
# round k. Rounds that are missing or outside 1-10 get NA.
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
ess$year <- replacements[match(ess$essround, seq_along(replacements))]

# Raw value counts for happy, used to spot special codes before recoding.
table(ess[["happy"]])

## 
##      0      1      2      3      4      5      6      7      8      9     10 
##   3933   3540   7343  13759  17016  52117  43707  87869 130802  76229  51006 
##     77     88     99 
##    330   2163    741

# Raw value counts for health, used to spot special codes before recoding.
table(ess[["health"]])

## 
##      1      2      3      4      5      7      8      9 
## 112145 203129 131573  35374   7435    119    421    359

# Raw value counts for impsafe, used to spot special codes before recoding.
table(ess[["impsafe"]])

## 
##      1      2      3      4      5      6      7      8      9 
## 128667 163675  85309  43811  25787   5323   1352   4044   4522

# Belgian respondents only (cntry == "BE"). The special codes seen in the raw
# tables (7/8/9 for health and impsafe, 77/88/99 for happy; presumably the
# survey's missing-value codes) are replaced by NA.
belgium_data <- mutate(
  filter(ess, cntry == "BE"),
  health = ifelse(!(health %in% c(7, 8, 9)), health, NA),
  happy = ifelse(!(happy %in% c(77, 88, 99)), happy, NA),
  impsafe = ifelse(!(impsafe %in% c(7, 8, 9)), impsafe, NA)
)

# Check: the special codes should no longer appear in happy.
table(belgium_data[["happy"]])

## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   50   27  104  194  234  830  999 3503 6521 3402 1565

# Check: the special codes should no longer appear in health.
table(belgium_data[["health"]])

## 
##    1    2    3    4    5 
## 4033 9262 3338  674  137

# Check: the special codes should no longer appear in impsafe.
table(belgium_data[["impsafe"]])

## 
##    1    2    3    4    5    6 
## 3205 7074 4025 1849  987  232

# Summary statistics for the three recoded variables.
belgium_data %>%
  select(happy, health, impsafe) %>%
  datasummary_skim()

	Unique (#)	Mean	SD	Min	Median	Max
happy	12	7.7	1.5	0.0	8.0	10.0
health	6	2.1	0.8	1.0	2.0	5.0
impsafe	7	2.5	1.2	1.0	2.0	6.0

In Belgium, respondents are generally happy: on the 0-10 happiness scale the mean is 7.7 and the maximum is 10. Self-rated health is also good, with a mean of 2.1, which corresponds to generally good health. The variable impsafe represents how important people think it is to live in safe surroundings. Most people answered "like me" or "somewhat like me" (mean of 2.5), meaning that while there is some concern about living in a safe place, it is not the biggest priority.

# Mean impsafe score per survey year in Belgium, ignoring missing answers.
impsafe_by_year <- summarize(
  group_by(belgium_data, year),
  impsafe_mean = mean(impsafe, na.rm = TRUE)
)
impsafe_by_year

## # A tibble: 10 × 2
##     year impsafe_mean
##    <dbl>        <dbl>
##  1  2002         2.43
##  2  2004         2.37
##  3  2006         2.53
##  4  2008         2.51
##  5  2010         2.52
##  6  2012         2.39
##  7  2014         2.47
##  8  2016         2.54
##  9  2018         2.55
## 10  2020         2.56

  # Trend of the yearly mean impsafe score in Belgium.
  # impsafe is answered on a 1-6 scale (1 = very much like me, 6 = not like me
  # at all), so a lower mean means safety is rated as more important.
  ggplot(impsafe_by_year, aes(x = year, y = impsafe_mean)) +
    # `linewidth` replaces `size` for lines, which is deprecated since ggplot2 3.4.0
    geom_line(color = "blue", linewidth = 1) +
    # Points highlight each survey year's value
    geom_point(color = "red", size = 3) +
    labs(
      title = "Important to live in secure and safe surroundings (2002-2020)",
      x = "Survey Year",
      y = "Mean answer (1 = very much like me, 6 = not like me at all)"
    ) +
    # y-axis limits from 0 to 6, covering the full 1-6 answer scale
    ylim(0, 6) +
    theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Over the years there was little change in the importance of living in safe surroundings: the yearly mean stays between 2.37 and 2.56, and the line shows only slight increases and decreases across the graph.

# Respondents from Belgium, the United Kingdom and France.
# impsafe uses 7/8/9 as its special codes (see the raw value table of
# ess$impsafe, whose values are 1-9), not 77/88/99. Recoding 77/88/99 would
# leave 7, 8 and 9 in the data as if they were real answers.
ess_selected <- ess %>%
  filter(cntry %in% c("BE", "GB", "FR")) %>%
  mutate(impsafe = ifelse(impsafe %in% c(7, 8, 9), NA, impsafe))


# Boxplots of impsafe for the three selected countries, ordered by descending
# median. na.rm = TRUE is forwarded by reorder() to median(); without it any
# country with a missing answer gets an NA median and the ordering is
# undefined.
task3plot <- ggplot(
  ess_selected,
  aes(
    x = reorder(cntry, -impsafe, FUN = median, na.rm = TRUE),
    y = impsafe,
    fill = cntry
  )
) +
  geom_boxplot() +
  theme_minimal() +
  # The x-axis already names the countries, so the fill legend is redundant
  theme(legend.position = "none") +
  labs(
    title = paste0(
      "Boxplot comparison for importance of living in a safe environment\n",
      "(Belgium, United Kingdom, France)"
    ),
    x = "Country",
    y = "Scale (1 = very much like me, 6 = not like me at all)"
  )

task3plot

## Warning: Removed 381 rows containing non-finite values (`stat_boxplot()`).

All three countries have the same median, meaning that they typically value living in a safe environment equally. However, in Belgium there are more outliers towards 6 (not like me at all). In France the maximum also extends much further than in the other two countries, meaning that there was a wider range of responses there. For Belgium there is more variety in the responses leaning towards 1 (very much like me).

# Collapse domicil into three area types. Codes 7/8/9 become NA; any other
# value is kept as its character form.
belgium_data <- mutate(
  belgium_data,
  geo = case_when(
    domicil == 1 ~ "Urban",
    # Code 2 could also be set to Urban; either way the decision needs to be
    # justified.
    domicil == 2 ~ "Peri-Urban",
    domicil %in% c(3, 4, 5) ~ "Rural",
    domicil %in% c(7, 8, 9) ~ NA_character_,
    TRUE ~ as.character(domicil)
  )
)

# Check the recoded area types.
table(belgium_data[["geo"]])

## 
## Peri-Urban      Rural      Urban 
##       1797      13293       2322

# Original domicil codes, for comparison with the geo counts.
table(belgium_data[["domicil"]])

## 
##    1    2    3    4    5    7    8    9 
## 2322 1797 4197 8185  911    1    5   33

# born_in_country: brncntr code 1 -> "Yes", code 2 -> "No"; every other value
# (including 7/8/9) becomes NA because match() finds no position for it.
belgium_data <- mutate(
  belgium_data,
  born_in_country = c("Yes", "No")[match(brncntr, c(1, 2))]
)

# Check the recoded birthplace variable.
table(belgium_data[["born_in_country"]])

## 
##    No   Yes 
##  2077 15370

# Column percentages: distribution of area type within each birthplace group.
cprop(table(belgium_data$geo, belgium_data$born_in_country))

##             
##              No    Yes   All  
##   Peri-Urban  10.3  10.3  10.3
##   Rural       56.1  79.1  76.4
##   Urban       33.5  10.6  13.3
##   Total      100.0 100.0 100.0

The table shows column percentages, so each column describes where people of a given birthplace live. A large majority of respondents born in Belgium live in rural areas (79.1%), compared with 56.1% of those born abroad. Urban areas show the opposite pattern: 33.5% of respondents born abroad live there, against only 10.6% of those born in Belgium. One possible inference is that urban areas are more popular among people who have immigrated to the country. The share living in peri-urban areas is the same for both groups (10.3%).

# Grouped bars of the column percentages of impsafe answers within each area
# type (geo). The "Total" row and "All" column are filtered out so that only
# the answer categories and area types are drawn.
table(belgium_data$impsafe, belgium_data$geo) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(
    Var1 != "Total",
    Var2 != "All"
  ) %>%
  ggplot(aes(x = Var1, y = Freq, fill = Var2)) +
  geom_col(position = "dodge") +
  labs(
    title = "Importance of living in a safe environment",
    y = "Conditional Percentage",
    x = "Belief Scale (1 = Very much like me - 6 = Not like me at all)",
    fill = "Area type"
  )

The most common response is 2 (like me), and the share of respondents choosing it is highest in the rural areas of the country. Interestingly, the only categories in which people responded the most as were in categories 1 (very much like me), 5 (not like me), and 6 (not like me at all). While overall there is a lower percentage of people in urban areas who responded with 6, there is a small minority that values living in a safe environment less than the rest of the population.

homework 5

Helen_Huang

2023-10-18