Homework 5_Lyra

# List of packages
packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer", 
              "fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr")

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "rmarkdown"    "kableExtra"   "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "kableExtra"   "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"

setwd("~/SOC 202/Homework 5_Lyra_Dong")
library(gapminder)
library(tidyverse)

library(fst)
ess <- read_fst("All-ESS-Data.fst")

Homework 5 (2.5%) due Oct. 16

Important: Must post link to your markdown on the discussion board (file and “knit” html), along with post detailing your general takeaways from exploring variables of interest, comparing to other countries, and looking into socio-demographics. You must also attach your R markdown file.

In your post, please link to the ESS website for the three variables you considered, as well as highlight the country you wish to focus on and for what main reason. Finally, please attach (or provide a screenshot) the visual that you produced while doing the tasks that is most interesting to you and explain why (just one visual not all).

Important note

You cannot use the same three variables we considered in the tutorial. You can use one of the three, but not all three.

Task 1

Do a data summary table of three variables of interest. Discuss briefly what you note (i.e., add text in your markdown after the Task 1 code).

table(ess$sclmeet)

## 
##      1      2      3      4      5      6      7     77     88     99 
##  10060  43072  46575  96993  85437 130629  74860    205   2261    463

table(ess$ipbhprp)

## 
##      1      2      3      4      5      6      7      8      9 
##  77804 163317 107251  59719  34855   8031   1558   5105   4850

table(ess$netusoft)

## 
##     1     2     3     4     5     7     8     9 
## 23872  9013  8739 14124 97494    49   101   199

ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  ess$year[ess$essround == i] <- replacements[i]
}

germany_data <- ess %>% 
  filter(cntry == "DE") %>% 
  mutate(
    netusoft = ifelse(netusoft %in% c(7, 8, 9), NA, netusoft),
    sclmeet = ifelse(sclmeet %in% c(77, 88, 99), NA, sclmeet), 
    ipbhprp = ifelse(ipbhprp %in% c(77, 88, 99), NA, ipbhprp), 
  )

datasummary_skim(germany_data %>% select(netusoft, sclmeet, ipbhprp))

	Unique (#)	Missing (%)	Mean	SD	Min	Median	Max
netusoft	6	60	4.3	1.3	1.0	5.0	5.0
sclmeet	8	0	4.7	1.4	1.0	5.0	7.0
ipbhprp	10	26	3.0	1.4	1.0	3.0	9.0

netusoft stands for Internet use, how often. 5 demonstrates a usage of the internet or other devices everyday. sclmeet stands for How often socially meet with friends, relatives or colleagues. 5 demonstrates once a week. ipbhprp stands for Important to behave properly. 3 demonstrates Somewhat like me. The positively skewed graph displays that a lot of people in Germany somewhat values the importance to behave properly.

Task 2

Choose one of the three variables you just summarized in the table. This will be your current main outcome of interest.

Produce a visual that showcases the mean (average) for your outcome of interest by survey year (can be, e.g., point + line plot or ridge plot, depending on your variable). Discuss briefly what you note (i.e., add text in your markdown after the Task 2 code).

sclmeet_by_year <- germany_data %>%
  group_by(year) %>%
  summarize(mean_sclmeet = mean(sclmeet, na.rm = TRUE))

ggplot(sclmeet_by_year, aes(x = year, y = mean_sclmeet)) +
  geom_line(color = "blue", size = 1) +  
  geom_point(color = "red", size = 3) +  
  labs(title = "Level of socially meet with friends in Germany (2002-2020)", 
       x = "Survey Year", 
       y = "sclmeet (0-10)") +
  ylim(0, 10) +  
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Overall, the data indicates that the frequency of social interactions with friends or relatives in Germany remains relatively stable at an average of 5 times per week. However, there was a noticeable decline in this rate during the year 2020, which can be attributed to the global COVID-19 pandemic.

Task 3

Provide a comparison visual of your outcome of interest with two other countries. You can choose the geom() you prefer. Discuss briefly what you note (i.e., add text in your markdown after the Task 3 code).

ess_selected <- ess %>%
  filter(cntry %in% c("DE", "CA", "KR")) %>%
  mutate(sclmeet = ifelse(sclmeet %in% c(77, 88, 99), NA, sclmeet))


task3plot <- ggplot(ess_selected, aes(x = reorder(cntry, -sclmeet, FUN=median), y = sclmeet, fill = cntry)) +
  geom_boxplot() +
  theme_minimal() + 
  theme(legend.position = "none") + 
  labs(title = "Level of socially meet with friends (Germany, Canada, Korea)", 
       x = "Country", 
       y = "Scale (0-10)")

task3plot

## Warning: Removed 154 rows containing non-finite values (`stat_boxplot()`).

Task 4

Produce a cross-tab between your outcome of interest and a socio-demographic variable (use datasummary_crosstab). Then, calculate column percentages using cprop(), making sure to pick a second socio-demographic variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 4 code).

germany_data <- germany_data %>%
  mutate(born_in_country = recode(brncntr,
                                  `1` = "Yes",
                                  `2` = "No",
                                  `7` = NA_character_,
                                  `8` = NA_character_,
                                  `9` = NA_character_))

table(germany_data$born_in_country)

## 
##    No   Yes 
##  3295 31014

table(germany_data$brncntr)

## 
##     1     2     7     8     9 
## 31014  3295     7     2   107

sclmeetborn <- datasummary_crosstab(sclmeet ~ born_in_country, data = germany_data)

sclmeetborn

sclmeet		No	Yes	All
1	N	52	326	380
	% row	13.7	85.8	100.0
2	N	300	2170	2476
	% row	12.1	87.6	100.0
3	N	350	3073	3429
	% row	10.2	89.6	100.0
4	N	794	8462	9272
	% row	8.6	91.3	100.0
5	N	601	5485	6095
	% row	9.9	90.0	100.0
6	N	868	8842	9723
	% row	8.9	90.9	100.0
7	N	317	2574	2896
	% row	10.9	88.9	100.0
All	N	3295	31014	34425
	% row	9.6	90.1	100.0

As depicted in the graph, individuals born in Germany exhibit a tendency to engage in frequent social interactions with their friends or relatives on multiple occasions throughout the week. This suggests that native Germans possess an inherent advantage of being familiarized with their own homeland, leading them to derive greater enjoyment from socializing compared to immigrants originating from outside of Germany.

germany_data <- germany_data %>%
  mutate(paidjob = case_when(
    pdjobev == 2 ~ "No",
    pdjobev == 1 ~ "Yes",
    pdjobev %in% c(6, 7, 8, 9) ~ NA_character_,
    TRUE ~ as.character(pdjobev)
  ))

table(germany_data$paidjob)

## 
##    No   Yes 
##  2151 12007

table(germany_data$pdjobev)

## 
##     1     2     6     7     8     9 
## 12007  2151 19643    46     3   575

table(germany_data$brncntr, germany_data$pdjobev) %>%
  cprop()

##        
##         1     2     6     7     8     9     All  
##   1      91.3  84.9  90.4  84.8 100.0  73.9  90.1
##   2       8.6  14.9   9.5   8.7   0.0  12.2   9.6
##   7       0.0   0.0   0.0   6.5   0.0   0.0   0.0
##   8       0.0   0.0   0.0   0.0   0.0   0.0   0.0
##   9       0.1   0.1   0.1   0.0   0.0  13.9   0.3
##   Total 100.0 100.0 100.0 100.0 100.0 100.0 100.0

Task 5

Choose one of the two socio-demographic variables you just worked with. Visualize the conditional probability (or column percentages) of your outcome given your selected socio-dem variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 5 code).

germany_clean <- germany_data %>%
  filter(!is.na(sclmeet) & !is.na(pdjobev))

germany_probs <- germany_clean %>%
  count(pdjobev, sclmeet) %>%
  group_by(sclmeet) %>%
  mutate(prob = n / sum(n))
ggplot(germany_probs, aes(x = as.factor(pdjobev), y = prob, color = sclmeet)) +
  geom_point() +
  geom_line(aes(group = sclmeet)) +
  labs(title = "Conditional Probabilities of paid job",
       subtitle = "by socially meet with friends or relatives",
       x = "paid job Scale", 
       y = "Probability") +
  theme_minimal()

Homework 5_Lyra_Dong

Lyra Dong

2023-10-04