This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer",
"fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr")
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[4]]
## [1] "RColorBrewer" "modelsummary" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "fst" "RColorBrewer" "modelsummary" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "viridis" "viridisLite" "fst" "RColorBrewer" "modelsummary"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[7]]
## [1] "knitr" "viridis" "viridisLite" "fst" "RColorBrewer"
## [6] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[8]]
## [1] "kableExtra" "knitr" "viridis" "viridisLite" "fst"
## [6] "RColorBrewer" "modelsummary" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
##
## [[9]]
## [1] "rmarkdown" "kableExtra" "knitr" "viridis" "viridisLite"
## [6] "fst" "RColorBrewer" "modelsummary" "lubridate" "forcats"
## [11] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "ggridges" "rmarkdown" "kableExtra" "knitr" "viridis"
## [6] "viridisLite" "fst" "RColorBrewer" "modelsummary" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[11]]
## [1] "ggridges" "rmarkdown" "kableExtra" "knitr" "viridis"
## [6] "viridisLite" "fst" "RColorBrewer" "modelsummary" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[12]]
## [1] "questionr" "ggridges" "rmarkdown" "kableExtra" "knitr"
## [6] "viridis" "viridisLite" "fst" "RColorBrewer" "modelsummary"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
ess <- read_fst("All-ESS-Data.fst")
table(ess$essround)
##
## 1 2 3 4 5 6 7 8 9 10
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685
ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
ess$year[ess$essround == i] <- replacements[i]
}
table(ess$clsprty)
##
## 1 2 7 8 9
## 231368 246470 2391 9334 992
table(ess$trstplt)
##
## 0 1 2 3 4 5 6 7 8 9 10 77 88
## 79525 42046 56947 63008 55417 81569 45397 34012 15562 3918 3106 576 8690
## 99
## 782
table(ess$stfdem)
##
## 0 1 2 3 4 5 6 7 8 9 10 77 88
## 29121 18116 31973 45014 46758 80154 55790 67816 59837 23346 12989 755 18211
## 99
## 675
france_data <- ess %>%
filter(cntry == "FR") %>%
mutate(
clsprty = ifelse(clsprty == 2, 0, ifelse(clsprty %in% c(7, 8, 9), NA, clsprty)),
stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem),
trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt),
)
table(france_data$clsprty)
##
## 0 1
## 9194 9507
table(france_data$stfdem)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1374 779 1703 2228 2357 3563 2125 2298 1669 398 267
table(france_data$trstplt)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 2642 1566 2738 2959 2580 3615 1460 850 362 71 49
##Task 1 Do a data summary table of three variables of interest. Discuss briefly what you note (i.e., add text in your markdown after the Task 1 code).
datasummary_skim(france_data %>% select(fltlnl, wrhpp, brncntr))
| Unique (#) | Missing (%) | Mean | SD | Min | Median | Max | ||
|---|---|---|---|---|---|---|---|---|
| fltlnl | 6 | 69 | 1.6 | 0.8 | 1.0 | 1.0 | 7.0 | |
| wrhpp | 7 | 69 | 2.9 | 0.8 | 1.0 | 3.0 | 8.0 | |
| brncntr | 4 | 0 | 1.1 | 0.3 | 1.0 | 1.0 | 8.0 |
From the summary, we can see that most of the respondents have a good relationship with their families; the average number of hours most people worked in the past week is in the middle of the histogram, which means that most people work similar hours; The last line shows all respondents were born in France.
##Task 2 Choose one of the three variables you just summarized in the table. This will be your current main outcome of interest.
Produce a visual that showcases the mean (average) for your outcome of interest by survey year (can be, e.g., point + line plot or ridge plot, depending on your variable). Discuss briefly what you note (i.e., add text in your markdown after the Task 2 code).
fltlnl_by_year <- france_data %>%
group_by(year) %>%
summarize(mean_fltlnl = mean(fltlnl, na.rm = TRUE))
fltlnl_by_year
## # A tibble: 10 × 2
## year mean_fltlnl
## <dbl> <dbl>
## 1 2002 NaN
## 2 2004 NaN
## 3 2006 1.54
## 4 2008 NaN
## 5 2010 NaN
## 6 2012 1.60
## 7 2014 1.55
## 8 2016 NaN
## 9 2018 NaN
## 10 2020 NaN
france_data %>%
ggplot(aes(x = fltlnl, y = as.factor(year), fill = as.factor(year))) +
geom_density_ridges() +
labs(title = "Distribution of fltlnl Across Survey Years For France",
x = "fltlnl Value", y = "Survey Year",
fill = "Survey Year") +
theme_minimal()
## Picking joint bandwidth of 0.148
## Warning: Removed 13167 rows containing non-finite values
## (`stat_density_ridges()`).
Observing the ridge plot of the output, I observed that the survey data
for fltlnl all showed almost the same peak value in 2006, 2012, and
2014, indicating that the factors affecting the fltlnl variable remained
relatively stable over the years.
Provide a comparison visual of your outcome of interest with two other countries. You can choose the geom() you prefer. Discuss briefly what you note (i.e., add text in your markdown after the Task 3 code).
table(ess$fltlnl)
##
## 1 2 3 4 7 8 9
## 91011 33675 8198 3902 125 805 142
ess_selected <- ess %>%
filter(cntry %in% c("FR", "DE", "CH")) %>%
mutate(fltlnl = ifelse(fltlnl %in% c(1,2), NA, fltlnl))
task3plot <- ggplot(ess_selected, aes(x = reorder(cntry, -fltlnl, FUN=median), y = fltlnl, fill = cntry)) +
geom_boxplot() +
theme_minimal() +
theme(legend.position = "none") +
labs(title = "Boxplot comparison for fltlnl (France, Germany, Switzerland)",
x = "Country",
y = "Scale (0-10)")
task3plot
## Warning: Removed 69040 rows containing non-finite values (`stat_boxplot()`).
It can be seen from the box plot that the median, first quartile and
third quartile of the fltlnld data in the three countries are similar;
it can also be seen that the outlier of Switzerland is very high, which
means that Switzerland has some specific A situation or group in which
they have a different perspective than the majority.
##Task 4 Produce a cross-tab between your outcome of interest and a socio-demographic variable (use datasummary_crosstab). Then, calculate column percentages using cprop(), making sure to pick a second socio-demographic variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 4 code).
france_data <- france_data %>%
mutate(
edulvla = case_when(
essround < 5 & edulvla == 55 ~ NA_real_,
TRUE ~ edulvla
),
edulvlb = case_when(
essround >= 5 & edulvlb == 5555 ~ NA_real_,
TRUE ~ edulvlb
),
educ_level = case_when(
essround < 5 & edulvla == 5 ~ "BA",
essround >= 5 & edulvlb > 600 ~ "BA",
TRUE ~ "No BA"
)
)
table(france_data$educ_level)
##
## BA No BA
## 4235 14803
fltlnledu <- datasummary_crosstab(fltlnl ~ educ_level, data = france_data)
fltlnledu
| fltlnl | BA | No BA | All | |
|---|---|---|---|---|
| 1 | N | 806 | 2816 | 3622 |
| % row | 22.3 | 77.7 | 100.0 | |
| 2 | N | 346 | 1172 | 1518 |
| % row | 22.8 | 77.2 | 100.0 | |
| 3 | N | 58 | 355 | 413 |
| % row | 14.0 | 86.0 | 100.0 | |
| 4 | N | 25 | 292 | 317 |
| % row | 7.9 | 92.1 | 100.0 | |
| 7 | N | 1 | 0 | 1 |
| % row | 100.0 | 0.0 | 100.0 | |
| All | N | 4235 | 14803 | 19038 |
| % row | 22.2 | 77.8 | 100.0 |
france_data <- france_data %>%
mutate(religion = case_when(
rlgblg == 2 ~ "No",
rlgblg == 1 ~ "Yes",
rlgblg %in% c(7, 8, 9) ~ NA_character_,
TRUE ~ as.character(rlgblg)
))
table(france_data$religion)
##
## No Yes
## 9434 9509
table(france_data$fltlnl, france_data$religion) %>% cprop()
##
## No Yes All
## 1 62.7 60.8 61.7
## 2 25.8 25.8 25.8
## 3 7.1 7.0 7.0
## 4 4.4 6.4 5.4
## Total 100.0 100.0 100.0
When the social demographic variable is education, the number of BAs in fltlnl is 4235, and the number of No BAs is 14803; when the sociodemographic variable is religion, the number of BAs in fltlnl is 9509, and the number of No BAs is 9434.
Choose one of the two socio-demographic variables you just worked with. Visualize the conditional probability (or column percentages) of your outcome given your selected socio-dem variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 5 code).
df <- france_data %>%
filter(!is.na(educ_level) & !is.na(fltlnl))
df <- df %>%
mutate(fltlnl = case_when(
fltlnl == 1 ~ "Yes",
fltlnl == 0 ~ "No",
TRUE ~ as.character(fltlnl)
))
table(df$fltlnl)
##
## 2 3 4 7 Yes
## 1518 413 317 1 3622
table(df$fltlnl, df$educ_level) %>%
cprop() %>%
as.data.frame() %>%
filter(Var1 != "Total",
Var2 != "All") %>%
ggplot(aes(x=Var1, y=Freq, fill=Var2)) +
geom_col(position = "dodge") +
labs(title="Feeling Lonely by Education Level in France",
y = "Conditional Percentage",
x = "Feeling lonely",
fill = "At least BA vs. Not")
The leftmost and rightmost bars of the histogram both have more BA than
No BA, and the second and third bars both have more No BA than BA.