Homework_5_Ruoxi

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer", 
              "fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr")


new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)


lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "rmarkdown"    "kableExtra"   "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "kableExtra"   "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"

ess <- read_fst("All-ESS-Data.fst")

table(ess$essround)

## 
##     1     2     3     4     5     6     7     8     9    10 
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685

ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  ess$year[ess$essround == i] <- replacements[i]
}
table(ess$fltlnl)

## 
##     1     2     3     4     7     8     9 
## 91011 33675  8198  3902   125   805   142

table(ess$wrhpp)

## 
##     1     2     3     4     7     8     9 
##  7040 32864 63754 32331   136  1560   173

table(ess$brncntr)

## 
##      1      2      7      8      9 
## 443915  45887     78    108    567

france_data <- ess %>% 
  filter(cntry == "FR") %>% 
  mutate(
    fltlnl = ifelse(fltlnl == 2, 0, ifelse(fltlnl %in% c(7, 8, 9), NA, fltlnl)), 
    wrhpp = ifelse(wrhpp %in% c(7, 8, 9), NA, wrhpp), 
    brncntr = ifelse(brncntr %in% c(7, 8, 9), NA, brncntr), 
  )

Task 1 Do a data summary table of three variables of interest. Discuss briefly what you note (i.e., add text in your markdown after the Task 1 code).

datasummary_skim(france_data %>% select(fltlnl, wrhpp, brncntr))

	Unique (#)	Missing (%)	Mean	SD	Min	Median	Max
fltlnl	5	69	1.0	1.0	0.0	1.0	4.0
wrhpp	5	69	2.9	0.8	1.0	3.0	4.0
brncntr	3	0	1.1	0.3	1.0	1.0	2.0

From the summary, we can see that most of the respondents have a good relationship with their families; the average number of hours most people worked in the past week is in the middle of the histogram, which means that most people work similar hours; The last line shows all respondents were born in France.

Task 2 Choose one of the three variables you just summarized in the table. This will be your current main outcome of interest.

Produce a visual that showcases the mean (average) for your outcome of interest by survey year (can be, e.g., point + line plot or ridge plot, depending on your variable). Discuss briefly what you note (i.e., add text in your markdown after the Task 2 code).

fltlnl_by_year <- france_data %>%
  group_by(year) %>%
  summarize(mean_fltlnl = mean(fltlnl, na.rm = TRUE))

fltlnl_by_year

## # A tibble: 10 × 2
##     year mean_fltlnl
##    <dbl>       <dbl>
##  1  2002      NaN   
##  2  2004      NaN   
##  3  2006        1.05
##  4  2008      NaN   
##  5  2010      NaN   
##  6  2012        1.08
##  7  2014        1.00
##  8  2016      NaN   
##  9  2018      NaN   
## 10  2020      NaN

france_data %>%
  ggplot(aes(x = fltlnl, y = as.factor(year), fill = as.factor(year))) +
  geom_density_ridges() + 
  labs(title = "Distribution of fltlnl Across Survey Years For France", 
       x = "fltlnl Value", y = "Survey Year",
       fill = "Survey Year") +
  theme_minimal()

## Picking joint bandwidth of 0.164

## Warning: Removed 13168 rows containing non-finite values
## (`stat_density_ridges()`).

Observing the ridge plot of the output, I observed that the survey data for fltlnl all showed almost the same peak value in 2006, 2012, and 2014, indicating that the factors affecting the fltlnl variable remained relatively stable over the years.

Task 3 Provide a comparison visual of your outcome of interest with two other countries. You can choose the geom() you prefer. Discuss briefly what you note (i.e., add text in your markdown after the Task 3 code).

table(ess$fltlnl)

## 
##     1     2     3     4     7     8     9 
## 91011 33675  8198  3902   125   805   142

ess_selected <- ess %>%
  filter(cntry %in% c("FR", "DE", "CH")) %>%
  mutate(fltlnl = ifelse(fltlnl %in% c(1,2), NA, fltlnl))

task3plot <- ggplot(ess_selected, aes(x = reorder(cntry, -fltlnl, FUN=median), y = fltlnl, fill = cntry)) +
  geom_boxplot() +
  theme_minimal() + 
  theme(legend.position = "none") + 
  labs(title = "Boxplot comparison for fltlnl (France, Germany, Switzerland)", 
       x = "Country", 
       y = "Scale (0-10)")

task3plot

## Warning: Removed 69040 rows containing non-finite values (`stat_boxplot()`).

It can be seen from the box plot that the median, first quartile and third quartile of the fltlnld data in the three countries are similar; it can also be seen that the outlier of Switzerland is very high, which means that Switzerland has some specific A situation or group in which they have a different perspective than the majority.

Task 4 Produce a cross-tab between your outcome of interest and a socio-demographic variable (use datasummary_crosstab). Then, calculate column percentages using cprop(), making sure to pick a second socio-demographic variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 4 code).

france_data <- france_data %>%
  mutate(
    edulvla = case_when(
      essround < 5 & edulvla == 55 ~ NA_real_,
      TRUE ~ edulvla
    ),
    edulvlb = case_when(
      essround >= 5 & edulvlb == 5555 ~ NA_real_,
      TRUE ~ edulvlb
    ),

    educ_level = case_when(
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      TRUE ~ "No BA"
    )
  )

table(france_data$educ_level)

## 
##    BA No BA 
##  4235 14803

fltlnledu <- datasummary_crosstab(fltlnl ~ educ_level, data = france_data)
fltlnledu

fltlnl		BA	No BA	All
0	N	346	1172	1518
	% row	22.8	77.2	100.0
1	N	806	2816	3622
	% row	22.3	77.7	100.0
3	N	58	355	413
	% row	14.0	86.0	100.0
4	N	25	292	317
	% row	7.9	92.1	100.0
All	N	4235	14803	19038
	% row	22.2	77.8	100.0

france_data <- france_data %>%
  mutate(religion = case_when(
    rlgblg == 2 ~ "No",
    rlgblg == 1 ~ "Yes",
    rlgblg %in% c(7, 8, 9) ~ NA_character_,
    TRUE ~ as.character(rlgblg)
  ))

table(france_data$religion)

## 
##   No  Yes 
## 9434 9509

table(france_data$fltlnl, france_data$religion) %>% cprop()

##        
##         No    Yes   All  
##   0      25.8  25.8  25.8
##   1      62.7  60.8  61.7
##   3       7.1   7.0   7.0
##   4       4.4   6.4   5.4
##   Total 100.0 100.0 100.0

When the social demographic variable is education, the number of BAs in fltlnl is 4235, and the number of No BAs is 14803; when the sociodemographic variable is religion, the number of BAs in fltlnl is 9509, and the number of No BAs is 9434.

Task 5 Choose one of the two socio-demographic variables you just worked with. Visualize the conditional probability (or column percentages) of your outcome given your selected socio-dem variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 5 code).

df <- france_data %>%
  filter(!is.na(educ_level) & !is.na(fltlnl))

df <- df %>%
  mutate(fltlnl = case_when(
    fltlnl == 1 ~ "None or almost none of the time",
    fltlnl == 2 ~ "Some of the time",
    fltlnl == 3 ~ "Most of the time",
    fltlnl == 4 ~ "All or almost all of the time",
    fltlnl == 7 ~ "Refusal*",
    TRUE ~ as.character(fltlnl)  
  ))

table(df$fltlnl)

## 
##                               0   All or almost all of the time 
##                            1518                             317 
##                Most of the time None or almost none of the time 
##                             413                            3622

table(df$fltlnl, df$educ_level) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(Var1 != "Total",
         Var2 != "All") %>%
  ggplot(aes(x=Var1, y=Freq, fill=Var2)) +
  geom_col(position = "dodge") +
  labs(title="Feeling Lonely by Education Level in France",
       y = "Conditional Percentage",
       x = "Feeling lonely",
       fill = "At least BA vs. Not")

It is worth noting that the number of BAs who feel lonely at none or almost none of the time in France is greater than the number of No BAs; the number of No BAs who feel lonely in Most of the time and All or almost all of the time is greater than the number of BAs.

Homework_5_Ruoxi_Lin

2023-10-16

R Markdown

Including Plots