# List of packages
packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer", 
              "fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr")

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "rmarkdown"    "kableExtra"   "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "kableExtra"   "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"
setwd("~/SOC 202/Homework 5_Lyra_Dong")
library(gapminder)
library(tidyverse)
library(fst)
ess <- read_fst("All-ESS-Data.fst")

Homework 5 (2.5%) due Oct. 16

Important: Must post link to your markdown on the discussion board (file and “knit” html), along with post detailing your general takeaways from exploring variables of interest, comparing to other countries, and looking into socio-demographics. You must also attach your R markdown file.

In your post, please link to the ESS website for the three variables you considered, as well as highlight the country you wish to focus on and for what main reason. Finally, please attach (or provide a screenshot) the visual that you produced while doing the tasks that is most interesting to you and explain why (just one visual not all).

Important note

You cannot use the same three variables we considered in the tutorial. You can use one of the three, but not all three.

Task 1

Do a data summary table of three variables of interest. Discuss briefly what you note (i.e., add text in your markdown after the Task 1 code).

table(ess$sclmeet)
## 
##      1      2      3      4      5      6      7     77     88     99 
##  10060  43072  46575  96993  85437 130629  74860    205   2261    463
table(ess$ipbhprp)
## 
##      1      2      3      4      5      6      7      8      9 
##  77804 163317 107251  59719  34855   8031   1558   5105   4850
table(ess$netusoft)
## 
##     1     2     3     4     5     7     8     9 
## 23872  9013  8739 14124 97494    49   101   199
ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  ess$year[ess$essround == i] <- replacements[i]
}

germany_data <- ess %>% 
  filter(cntry == "DE") %>% 
  mutate(
    netusoft = ifelse(netusoft %in% c(7, 8, 9), NA, netusoft),
    sclmeet = ifelse(sclmeet %in% c(77, 88, 99), NA, sclmeet), 
    ipbhprp = ifelse(ipbhprp %in% c(77, 88, 99), NA, ipbhprp), 
  )

datasummary_skim(germany_data %>% select(netusoft, sclmeet, ipbhprp))
Unique (#) Missing (%) Mean SD Min Median Max
netusoft 6 60 4.3 1.3 1.0 5.0 5.0
sclmeet 8 0 4.7 1.4 1.0 5.0 7.0
ipbhprp 10 26 3.0 1.4 1.0 3.0 9.0

netusoft stands for Internet use, how often. 5 demonstrates a usage of the internet or other devices everyday. sclmeet stands for How often socially meet with friends, relatives or colleagues. 5 demonstrates once a week. ipbhprp stands for Important to behave properly. 3 demonstrates Somewhat like me. The positively skewed graph displays that a lot of people in Germany somewhat values the importance to behave properly.

Task 2

Choose one of the three variables you just summarized in the table. This will be your current main outcome of interest.

Produce a visual that showcases the mean (average) for your outcome of interest by survey year (can be, e.g., point + line plot or ridge plot, depending on your variable). Discuss briefly what you note (i.e., add text in your markdown after the Task 2 code).

sclmeet_by_year <- germany_data %>%
  group_by(year) %>%
  summarize(mean_sclmeet = mean(sclmeet, na.rm = TRUE))

ggplot(sclmeet_by_year, aes(x = year, y = mean_sclmeet)) +
  geom_line(color = "blue", size = 1) +  
  geom_point(color = "red", size = 3) +  
  labs(title = "Level of socially meet with friends in Germany (2002-2020)", 
       x = "Survey Year", 
       y = "sclmeet (0-10)") +
  ylim(0, 10) +  
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Overall, the data indicates that the frequency of social interactions with friends or relatives in Germany remains relatively stable at an average of 5 times per week. However, there was a noticeable decline in this rate during the year 2020, which can be attributed to the global COVID-19 pandemic.

Task 3

Provide a comparison visual of your outcome of interest with two other countries. You can choose the geom() you prefer. Discuss briefly what you note (i.e., add text in your markdown after the Task 3 code).

ess_selected <- ess %>%
  filter(cntry %in% c("DE", "CA", "KR")) %>%
  mutate(sclmeet = ifelse(sclmeet %in% c(77, 88, 99), NA, sclmeet))


task3plot <- ggplot(ess_selected, aes(x = reorder(cntry, -sclmeet, FUN=median), y = sclmeet, fill = cntry)) +
  geom_boxplot() +
  theme_minimal() + 
  theme(legend.position = "none") + 
  labs(title = "Level of socially meet with friends (Germany, Canada, Korea)", 
       x = "Country", 
       y = "Scale (0-10)")

task3plot
## Warning: Removed 154 rows containing non-finite values (`stat_boxplot()`).

Task 4

Produce a cross-tab between your outcome of interest and a socio-demographic variable (use datasummary_crosstab). Then, calculate column percentages using cprop(), making sure to pick a second socio-demographic variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 4 code).

germany_data <- germany_data %>%
  mutate(born_in_country = recode(brncntr,
                                  `1` = "Yes",
                                  `2` = "No",
                                  `7` = NA_character_,
                                  `8` = NA_character_,
                                  `9` = NA_character_))

table(germany_data$born_in_country)
## 
##    No   Yes 
##  3295 31014
table(germany_data$brncntr)
## 
##     1     2     7     8     9 
## 31014  3295     7     2   107
sclmeetborn <- datasummary_crosstab(sclmeet ~ born_in_country, data = germany_data)

sclmeetborn
sclmeet No Yes All
1 N 52 326 380
% row 13.7 85.8 100.0
2 N 300 2170 2476
% row 12.1 87.6 100.0
3 N 350 3073 3429
% row 10.2 89.6 100.0
4 N 794 8462 9272
% row 8.6 91.3 100.0
5 N 601 5485 6095
% row 9.9 90.0 100.0
6 N 868 8842 9723
% row 8.9 90.9 100.0
7 N 317 2574 2896
% row 10.9 88.9 100.0
All N 3295 31014 34425
% row 9.6 90.1 100.0

As depicted in the graph, individuals born in Germany exhibit a tendency to engage in frequent social interactions with their friends or relatives on multiple occasions throughout the week. This suggests that native Germans possess an inherent advantage of being familiarized with their own homeland, leading them to derive greater enjoyment from socializing compared to immigrants originating from outside of Germany.

germany_data <- germany_data %>%
  mutate(paidjob = case_when(
    pdjobev == 2 ~ "No",
    pdjobev == 1 ~ "Yes",
    pdjobev %in% c(6, 7, 8, 9) ~ NA_character_,
    TRUE ~ as.character(pdjobev)
  ))

table(germany_data$paidjob)
## 
##    No   Yes 
##  2151 12007
table(germany_data$pdjobev)
## 
##     1     2     6     7     8     9 
## 12007  2151 19643    46     3   575
table(germany_data$brncntr, germany_data$pdjobev) %>%
  cprop()
##        
##         1     2     6     7     8     9     All  
##   1      91.3  84.9  90.4  84.8 100.0  73.9  90.1
##   2       8.6  14.9   9.5   8.7   0.0  12.2   9.6
##   7       0.0   0.0   0.0   6.5   0.0   0.0   0.0
##   8       0.0   0.0   0.0   0.0   0.0   0.0   0.0
##   9       0.1   0.1   0.1   0.0   0.0  13.9   0.3
##   Total 100.0 100.0 100.0 100.0 100.0 100.0 100.0

Task 5

Choose one of the two socio-demographic variables you just worked with. Visualize the conditional probability (or column percentages) of your outcome given your selected socio-dem variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 5 code).

germany_clean <- germany_data %>%
  filter(!is.na(sclmeet) & !is.na(pdjobev))

germany_probs <- germany_clean %>%
  count(pdjobev, sclmeet) %>%
  group_by(sclmeet) %>%
  mutate(prob = n / sum(n))
ggplot(germany_probs, aes(x = as.factor(pdjobev), y = prob, color = sclmeet)) +
  geom_point() +
  geom_line(aes(group = sclmeet)) +
  labs(title = "Conditional Probabilities of paid job",
       subtitle = "by socially meet with friends or relatives",
       x = "paid job Scale", 
       y = "Probability") +
  theme_minimal()