# List of packages
packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer",
"fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr")
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[4]]
## [1] "RColorBrewer" "modelsummary" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "fst" "RColorBrewer" "modelsummary" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "viridis" "viridisLite" "fst" "RColorBrewer" "modelsummary"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[7]]
## [1] "knitr" "viridis" "viridisLite" "fst" "RColorBrewer"
## [6] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[8]]
## [1] "kableExtra" "knitr" "viridis" "viridisLite" "fst"
## [6] "RColorBrewer" "modelsummary" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
##
## [[9]]
## [1] "rmarkdown" "kableExtra" "knitr" "viridis" "viridisLite"
## [6] "fst" "RColorBrewer" "modelsummary" "lubridate" "forcats"
## [11] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "ggridges" "rmarkdown" "kableExtra" "knitr" "viridis"
## [6] "viridisLite" "fst" "RColorBrewer" "modelsummary" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[11]]
## [1] "ggridges" "rmarkdown" "kableExtra" "knitr" "viridis"
## [6] "viridisLite" "fst" "RColorBrewer" "modelsummary" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[12]]
## [1] "questionr" "ggridges" "rmarkdown" "kableExtra" "knitr"
## [6] "viridis" "viridisLite" "fst" "RColorBrewer" "modelsummary"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
setwd("~/SOC 202/Homework 5_Lyra_Dong")
library(gapminder)
library(tidyverse)
library(fst)
ess <- read_fst("All-ESS-Data.fst")
Important: Must post link to your markdown on the discussion board (file and “knit” html), along with post detailing your general takeaways from exploring variables of interest, comparing to other countries, and looking into socio-demographics. You must also attach your R markdown file.
In your post, please link to the ESS website for the three variables you considered, as well as highlight the country you wish to focus on and for what main reason. Finally, please attach (or provide a screenshot) the visual that you produced while doing the tasks that is most interesting to you and explain why (just one visual not all).
You cannot use the same three variables we considered in the tutorial. You can use one of the three, but not all three.
Do a data summary table of three variables of interest. Discuss briefly what you note (i.e., add text in your markdown after the Task 1 code).
table(ess$sclmeet)
##
## 1 2 3 4 5 6 7 77 88 99
## 10060 43072 46575 96993 85437 130629 74860 205 2261 463
table(ess$ipbhprp)
##
## 1 2 3 4 5 6 7 8 9
## 77804 163317 107251 59719 34855 8031 1558 5105 4850
table(ess$netusoft)
##
## 1 2 3 4 5 7 8 9
## 23872 9013 8739 14124 97494 49 101 199
ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
ess$year[ess$essround == i] <- replacements[i]
}
germany_data <- ess %>%
filter(cntry == "DE") %>%
mutate(
netusoft = ifelse(netusoft %in% c(7, 8, 9), NA, netusoft),
sclmeet = ifelse(sclmeet %in% c(77, 88, 99), NA, sclmeet),
ipbhprp = ifelse(ipbhprp %in% c(77, 88, 99), NA, ipbhprp),
)
datasummary_skim(germany_data %>% select(netusoft, sclmeet, ipbhprp))
| Unique (#) | Missing (%) | Mean | SD | Min | Median | Max | ||
|---|---|---|---|---|---|---|---|---|
| netusoft | 6 | 60 | 4.3 | 1.3 | 1.0 | 5.0 | 5.0 | |
| sclmeet | 8 | 0 | 4.7 | 1.4 | 1.0 | 5.0 | 7.0 | |
| ipbhprp | 10 | 26 | 3.0 | 1.4 | 1.0 | 3.0 | 9.0 |
netusoft stands for Internet use, how often. 5 demonstrates a usage of the internet or other devices everyday. sclmeet stands for How often socially meet with friends, relatives or colleagues. 5 demonstrates once a week. ipbhprp stands for Important to behave properly. 3 demonstrates Somewhat like me. The positively skewed graph displays that a lot of people in Germany somewhat values the importance to behave properly.
Choose one of the three variables you just summarized in the table. This will be your current main outcome of interest.
Produce a visual that showcases the mean (average) for your outcome of interest by survey year (can be, e.g., point + line plot or ridge plot, depending on your variable). Discuss briefly what you note (i.e., add text in your markdown after the Task 2 code).
sclmeet_by_year <- germany_data %>%
group_by(year) %>%
summarize(mean_sclmeet = mean(sclmeet, na.rm = TRUE))
ggplot(sclmeet_by_year, aes(x = year, y = mean_sclmeet)) +
geom_line(color = "blue", size = 1) +
geom_point(color = "red", size = 3) +
labs(title = "Level of socially meet with friends in Germany (2002-2020)",
x = "Survey Year",
y = "sclmeet (0-10)") +
ylim(0, 10) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Overall, the data indicates that the frequency of social interactions with friends or relatives in Germany remains relatively stable at an average of 5 times per week. However, there was a noticeable decline in this rate during the year 2020, which can be attributed to the global COVID-19 pandemic.
Provide a comparison visual of your outcome of interest with two other countries. You can choose the geom() you prefer. Discuss briefly what you note (i.e., add text in your markdown after the Task 3 code).
ess_selected <- ess %>%
filter(cntry %in% c("DE", "CA", "KR")) %>%
mutate(sclmeet = ifelse(sclmeet %in% c(77, 88, 99), NA, sclmeet))
task3plot <- ggplot(ess_selected, aes(x = reorder(cntry, -sclmeet, FUN=median), y = sclmeet, fill = cntry)) +
geom_boxplot() +
theme_minimal() +
theme(legend.position = "none") +
labs(title = "Level of socially meet with friends (Germany, Canada, Korea)",
x = "Country",
y = "Scale (0-10)")
task3plot
## Warning: Removed 154 rows containing non-finite values (`stat_boxplot()`).
Produce a cross-tab between your outcome of interest and a socio-demographic variable (use datasummary_crosstab). Then, calculate column percentages using cprop(), making sure to pick a second socio-demographic variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 4 code).
germany_data <- germany_data %>%
mutate(born_in_country = recode(brncntr,
`1` = "Yes",
`2` = "No",
`7` = NA_character_,
`8` = NA_character_,
`9` = NA_character_))
table(germany_data$born_in_country)
##
## No Yes
## 3295 31014
table(germany_data$brncntr)
##
## 1 2 7 8 9
## 31014 3295 7 2 107
sclmeetborn <- datasummary_crosstab(sclmeet ~ born_in_country, data = germany_data)
sclmeetborn
| sclmeet | No | Yes | All | |
|---|---|---|---|---|
| 1 | N | 52 | 326 | 380 |
| % row | 13.7 | 85.8 | 100.0 | |
| 2 | N | 300 | 2170 | 2476 |
| % row | 12.1 | 87.6 | 100.0 | |
| 3 | N | 350 | 3073 | 3429 |
| % row | 10.2 | 89.6 | 100.0 | |
| 4 | N | 794 | 8462 | 9272 |
| % row | 8.6 | 91.3 | 100.0 | |
| 5 | N | 601 | 5485 | 6095 |
| % row | 9.9 | 90.0 | 100.0 | |
| 6 | N | 868 | 8842 | 9723 |
| % row | 8.9 | 90.9 | 100.0 | |
| 7 | N | 317 | 2574 | 2896 |
| % row | 10.9 | 88.9 | 100.0 | |
| All | N | 3295 | 31014 | 34425 |
| % row | 9.6 | 90.1 | 100.0 |
As depicted in the graph, individuals born in Germany exhibit a tendency to engage in frequent social interactions with their friends or relatives on multiple occasions throughout the week. This suggests that native Germans possess an inherent advantage of being familiarized with their own homeland, leading them to derive greater enjoyment from socializing compared to immigrants originating from outside of Germany.
germany_data <- germany_data %>%
mutate(paidjob = case_when(
pdjobev == 2 ~ "No",
pdjobev == 1 ~ "Yes",
pdjobev %in% c(6, 7, 8, 9) ~ NA_character_,
TRUE ~ as.character(pdjobev)
))
table(germany_data$paidjob)
##
## No Yes
## 2151 12007
table(germany_data$pdjobev)
##
## 1 2 6 7 8 9
## 12007 2151 19643 46 3 575
table(germany_data$brncntr, germany_data$pdjobev) %>%
cprop()
##
## 1 2 6 7 8 9 All
## 1 91.3 84.9 90.4 84.8 100.0 73.9 90.1
## 2 8.6 14.9 9.5 8.7 0.0 12.2 9.6
## 7 0.0 0.0 0.0 6.5 0.0 0.0 0.0
## 8 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 9 0.1 0.1 0.1 0.0 0.0 13.9 0.3
## Total 100.0 100.0 100.0 100.0 100.0 100.0 100.0
Choose one of the two socio-demographic variables you just worked with. Visualize the conditional probability (or column percentages) of your outcome given your selected socio-dem variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 5 code).
germany_clean <- germany_data %>%
filter(!is.na(sclmeet) & !is.na(pdjobev))
germany_probs <- germany_clean %>%
count(pdjobev, sclmeet) %>%
group_by(sclmeet) %>%
mutate(prob = n / sum(n))
ggplot(germany_probs, aes(x = as.factor(pdjobev), y = prob, color = sclmeet)) +
geom_point() +
geom_line(aes(group = sclmeet)) +
labs(title = "Conditional Probabilities of paid job",
subtitle = "by socially meet with friends or relatives",
x = "paid job Scale",
y = "Probability") +
theme_minimal()