# List of packages
packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer", 
              "fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "rmarkdown"    "kableExtra"   "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "kableExtra"   "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"
getwd()
## [1] "/Users/arlenewang/Documents/soc252/SOC252_Proj"
ess <- read_fst("All-ESS-Data.fst")
print("hell yeah this works!")
## [1] "hell yeah this works!"

Mission 1

Create tables between ess and variables of interest. Take note of what you need to clean (i.e., the 7, 8, 9, and 77, 88, 99).

ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  ess$year[ess$essround == i] <- replacements[i]
}
portugal_data <- ess[ess$cntry == "PT", ]
table(portugal_data$marsts)
## 
##    1    2    3    4    5    6   66   77   88   99 
##  240    7   91  741 1078 2223 4783   18    9  539
datasummary_skim(portugal_data$marsts)
Unique (#) Missing (%) Mean SD Min Median Max
data 11 46 40.4 32.9 1.0 66.0 99.0
table(portugal_data$sclmeet)
## 
##    1    2    3    4    5    6    7   77   88   99 
##  295  899  750 2169 1773 4066 7863   15   46    5
pt_subset <- portugal_data %>%
  filter(cntry == "PT") %>%
  mutate(
      marsts = ifelse(marsts %in% c(66, 77, 88, 99), NA, marsts),
      sclmeet = ifelse(sclmeet %in% c(77, 88, 99), NA, sclmeet)
      
  ) %>%
  select(marsts, sclmeet)
datasummary_skim(pt_subset)
Unique (#) Missing (%) Mean SD Min Median Max
marsts 7 76 5.1 1.3 1.0 6.0 6.0
sclmeet 8 0 5.7 1.6 1.0 6.0 7.0
table(pt_subset$marsts)
## 
##    1    2    3    4    5    6 
##  240    7   91  741 1078 2223
table(pt_subset$sclmeet)
## 
##    1    2    3    4    5    6    7 
##  295  899  750 2169 1773 4066 7863

Mission 2

Look into an additional socio-demographic variable (other than the 5 covered in the tutorial), create a table and take note of how to clean it. You can do more than one, but do at least one for the mission.

Provide the ess data portal link here. For example, like so:

https://ess-search.nsd.no/en/variable/32558258-72b1-479b-8cca-49c9d569408d

Take note of the categories and consider how to recode them (using mutate) so as to create fewer categories (if needed) that are meaningful for analytical purposes. For example, for the one above you can consider recoding 1, 2, and 5 together, 3 and 4 together, and 6 separately.

tporgwk: What type of organisation work/worked for

https://ess-search.nsd.no/en/variable/28a90d9d-34f4-42d2-9293-154906d09e4a 1 Central or local government 3015
9.2% 2 Other public sector (such as education and health) 4299
13.1% 3 A state owned enterprise 3351
10.2% 4 A private firm 18385
56% 5 Self employed 3113
9.5% 6 Other 682 2.1% 66 Not applicable 4134
77 Refusal 475 88 Don’t know 114 99 No answer 43

Mission 3

Filter to your country of interest and use mutate to clean your variables. If you want, you can “select” all your variables of interest now, including the socio-demographic ones.

Note: whenever you recode or clean, always double check what you did by comparing pre and post. So, for example, if you tried to clean and named your dataset france_clean from the uncleaned france_data, do tables of variables of interest between the two to see if the recodes and cleaning worked as intended.

yay I did this in mission 1 by accident

Mission 4

Do a datasummary_skim of variables of interest. You can select variables as follows:

# datasummary_skim(dataset %>% select(v1, v2, v3))

Note: code here and below, you just need to replace with what you need and remove the hashtags.

yay I did this already above

Mission 5

Do a quick frequency check for socio-demographics of interest, then visualize. Here’s an example for geo:

#freq(filtered_data$geo)

#filtered_data %>%
 # drop_na(geo) %>%
#  select(geo) %>%
 # freq() %>%
#  as.data.frame() %>%
#  ggplot(aes(x=factor(rownames(.),
 #                    levels= c("Urban",
  #                                   "Peri-Urban",
   #                                  "Rural")), 
    #         y=`%`)) +
#  geom_col() +
 # labs(title = "Distribution of Place of Residence",
  #     x = "Geo")

Depending on what you look into, you will note that some of the percentage seem off relative to what it would be for the entire population. For example, % rural in the sample vs. % rural in the population. That is why we eventually need to make adjustments and apply survey weights. We will talk more about the specific survey design of the ESS in future lectures.

So let’s turn to do conditional probabilities, giving us a better sense of what’s going conditional on a specific category (i.e., dividing by say where someone lives rather than by the total).

freq(pt_subset$marsts)
##        n    % val%
## 1    240  1.3  5.5
## 2      7  0.0  0.2
## 3     91  0.5  2.1
## 4    741  4.1 16.9
## 5   1078  6.0 24.6
## 6   2223 12.4 50.8
## NA 13501 75.5   NA
pt_subset<-pt_subset %>%
  drop_na(marsts)

pt_subset_freq <- pt_subset %>%
  group_by(marsts)%>%
  summarize(freq=n())

ggplot(pt_subset_freq, aes(x=marsts, y=freq)) +
  scale_x_continuous(breaks = 1:6, limits = c(0,7), labels = c("Legally married", 
                                "In a legally registered civil union",
                                 "Legally separated",
                                "Legally divorced/Civil union dissolved",
                                "Widowed/Civil partner died",
                                "None of these")) +             
  geom_col() +
  theme(axis.text.x = element_text(angle = 90))

Mission 6

Do column percentages (as conditional %) with cprop(). Here’s an example:

#table(france_data$clsprty, france_data$educ_level) %>%
 # cprop()
table(portugal_data$marsts, portugal_data$sclmeet) %>%
  cprop()
##        
##         1     2     3     4     5     6     7     77    88    All  
##   1       4.5   2.4   3.0   2.7   2.8   2.5   2.2   0.0   5.6   2.5
##   2       0.0   0.0   0.2   0.2   0.0   0.0   0.1   0.0   0.0   0.1
##   3       2.3   1.4   0.6   0.5   0.7   1.3   0.8   0.0   0.0   0.9
##   4      12.1   8.8  10.8   8.1   7.8   7.1   7.0  30.0   5.6   7.6
##   5      23.5  19.8  10.4   9.8  12.7  11.5   9.5  10.0  22.2  11.1
##   6      12.9  15.5  17.1  17.9  16.9  23.6  27.1  10.0   5.6  22.8
##   66     36.4  49.6  54.9  56.0  53.7  48.2  46.3  40.0  61.1  49.2
##   77      0.0   0.0   0.2   0.2   0.2   0.1   0.2   0.0   0.0   0.2
##   88      0.0   0.2   0.0   0.1   0.1   0.1   0.1   0.0   0.0   0.1
##   99      8.3   2.2   2.8   4.4   5.1   5.5   6.6  10.0   0.0   5.5
##   Total 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0
table(portugal_data$marsts, portugal_data$sclmeet) %>%
  rprop()
##      
##       1     2     3     4     5     6     7     77    88    Total
##   1     2.5   5.0   5.8  14.2  10.0  23.3  38.8   0.0   0.4 100.0
##   2     0.0   0.0  14.3  28.6   0.0  14.3  42.9   0.0   0.0 100.0
##   3     3.3   7.7   3.3   7.7   6.6  33.0  38.5   0.0   0.0 100.0
##   4     2.2   5.8   6.7  14.0   9.2  21.6  39.9   0.4   0.1 100.0
##   5     2.9   9.0   4.5  11.7  10.3  23.9  37.3   0.1   0.4 100.0
##   6     0.8   3.4   3.6  10.3   6.6  23.8  51.4   0.0   0.0 100.0
##   66    1.0   5.1   5.3  15.0   9.8  22.7  40.8   0.1   0.2 100.0
##   77    0.0   0.0   5.6  16.7  11.1  16.7  50.0   0.0   0.0 100.0
##   88    0.0  11.1   0.0  11.1  11.1  22.2  44.4   0.0   0.0 100.0
##   99    2.0   2.0   2.4  10.6   8.2  22.8  51.8   0.2   0.0 100.0
##   All   1.4   5.0   4.8  13.2   9.0  23.1  43.3   0.1   0.2 100.0

You can also do row percentages with rprop().

Mission 7

Do cross tabs (especially useful for exploring relationships between categorical variables). Here’s an example we saw:

# clsprtyedu <- datasummary_crosstab(clsprty ~ educ_level, data = france_data)
# clsprtyedu

Note: you can do it for a continuous/discrete variable, but the table will yield column percentages for each of the values for a column (for example, for stfdem, all of 0 to 10).

mar_scl <- datasummary_crosstab(marsts ~ sclmeet, data = portugal_data)
mar_scl
marsts 1  2  3  4  5  6  7  77  88  99 All
1 N 6 12 14 34 24 56 93 0 1 0 240
% row 2.5 5.0 5.8 14.2 10.0 23.3 38.8 0.0 0.4 0.0 100.0
2 N 0 0 1 2 0 1 3 0 0 0 7
% row 0.0 0.0 14.3 28.6 0.0 14.3 42.9 0.0 0.0 0.0 100.0
3 N 3 7 3 7 6 30 35 0 0 0 91
% row 3.3 7.7 3.3 7.7 6.6 33.0 38.5 0.0 0.0 0.0 100.0
4 N 16 43 50 104 68 160 296 3 1 0 741
% row 2.2 5.8 6.7 14.0 9.2 21.6 39.9 0.4 0.1 0.0 100.0
5 N 31 97 48 126 111 258 402 1 4 0 1078
% row 2.9 9.0 4.5 11.7 10.3 23.9 37.3 0.1 0.4 0.0 100.0
6 N 17 76 79 229 147 530 1143 1 1 0 2223
% row 0.8 3.4 3.6 10.3 6.6 23.8 51.4 0.0 0.0 0.0 100.0
66 N 48 243 254 718 468 1084 1953 4 11 0 4783
% row 1.0 5.1 5.3 15.0 9.8 22.7 40.8 0.1 0.2 0.0 100.0
77 N 0 0 1 3 2 3 9 0 0 0 18
% row 0.0 0.0 5.6 16.7 11.1 16.7 50.0 0.0 0.0 0.0 100.0
88 N 0 1 0 1 1 2 4 0 0 0 9
% row 0.0 11.1 0.0 11.1 11.1 22.2 44.4 0.0 0.0 0.0 100.0
99 N 11 11 13 57 44 123 279 1 0 0 539
% row 2.0 2.0 2.4 10.6 8.2 22.8 51.8 0.2 0.0 0.0 100.0
All N 295 899 750 2169 1773 4066 7863 15 46 5 17881
% row 1.6 5.0 4.2 12.1 9.9 22.7 44.0 0.1 0.3 0.0 100.0

Mission 8

Produce a visual for how a categorical of interest is distributed over time.

For year example:

pt_mision8_subset <- portugal_data %>%
  filter(cntry == "PT") %>%
  mutate(
      marsts = ifelse(marsts %in% c(66, 77, 88, 99), NA, marsts),
      sclmeet = ifelse(sclmeet %in% c(77, 88, 99), NA, sclmeet)
      
  ) %>%
  select(marsts, sclmeet, year)

table(pt_mision8_subset$sclmeet, pt_mision8_subset$year) %>%
cprop() %>%
 as.data.frame() %>%
filter(Var1 != "Total",
      Var2 != "All") %>%
 ggplot(aes(x=Var2 %>% as.character() %>% as.integer(),
           y=Freq,
          color=Var1)) +
 geom_line() +
labs(title="Frequency of Social Interactions by Survey Year",
    x = "Survey",
   color = "Sclmeet")

For birth year example:

#table(filtered_data$educ_level, filtered_data$year) %>%
 # cprop() %>%
#  as.data.frame() %>%
 # filter(Var1 != "Total",
  #       Var2 != "All") %>%
#  ggplot(aes(x=Var2 %>% as.character() %>% as.integer(), 
 #            y=Freq, 
  #           color=Var1)) +
#  geom_line() +
 # labs(title="Educational levels by Birth Year",
  #     x = "Birth Year",
   #    color = "Education")

Mission 9

Do a visualization between a categorical variable and an outcome of interest. You can play around with geom(). Focus on computing as average.

pt_mission9_subset <- portugal_data %>%
  filter(cntry == "PT") %>%
  mutate(
      marsts = ifelse(marsts %in% c(66, 77, 88, 99), NA, marsts),
      sclmeet = ifelse(sclmeet %in% c(77, 88, 99), NA, sclmeet)
      
  ) %>%
  select(marsts, sclmeet, year)

pt_mission9_subset <- pt_mission9_subset %>%
  mutate(recode(pt_mission9_subset$marsts, 
         `1` = "Legally married", 
         `2` = "In a legally registered civil union", 
         `3` = "Legally separated", 
         `4` ="Legally divorced/Civil union dissolved",
         `5` = "Widowed/Civil partner died",
         `6` = "None of these",
         `66` = NA_character_,
         `77` = NA_character_,
                                  `88` = NA_character_,
                                  `99` = NA_character_))
ggplot(pt_mission9_subset, aes(x = marsts, y = sclmeet, fill = factor(marsts))) +
  geom_boxplot() +
  labs(title = "Frequency of social interaction by marital status",
       x = "Marital status",
       y = "Freq of social interaction") +
  scale_fill_viridis_d(name = "Marital status", option = "A") +
  theme_minimal()
## Warning: Removed 13501 rows containing missing values (`stat_boxplot()`).
## Warning: Removed 12 rows containing non-finite values (`stat_boxplot()`).

## Mission 10

Do a second visualization between the same categorical variable and an outcome of interest. Focus this time on computing conditional probabilities or percentages.

Here are two code examples:

# double check clean
#france_data <- france_data %>%
 # filter(!is.na(educ_level) & !is.na(clsprty))

# visualize
#table(france_data$clsprty, france_data$educ_level) %>%
 # cprop() %>%
  #as.data.frame() %>%
  #filter(Var1 != "Total",
   #      Var2 != "All") %>%
  #ggplot(aes(x=Var1, y=Freq, fill=Var2)) +
  #geom_col(position = "dodge") +
  #labs(title="Feeling close to a particular party in France",
   #    x = "Feeling close (Y/N)",
    #   fill = "At least BA vs. Not")
# double check clean
#france_clean <- france_data %>%
 # filter(!is.na(geo) & !is.na(trstplt))
# calculate conditional probabilities
#france_probs <- france_clean %>%
 # count(trstplt, geo) %>%
#  group_by(geo) %>%
 # mutate(prob = n / sum(n))
# plot
#ggplot(france_probs, aes(x = as.factor(trstplt), y = prob, color = geo)) +
 # geom_point() +
#  geom_line(aes(group = geo)) +
 # labs(title = "Conditional Probabilities of Trust in Politicians",
  #     subtitle = "by Place of Residence",
   #    x = "Trust Scale", 
    #   y = "Probability") +
  #theme_minimal()

Mission 11

Shifting to your outcome of interest, produce a visualization for the average by survey year, as we did in the last tutorial.

Mission 12

Change the ylim, to ‘Zoom in’ and see more of the fluctuations from year to year. You can play around with annotating, or adding text, etc. if you wish.

Mission 13

Compare your mean(outcome) to the ESS baseline (all, inclusive of your country).

Mission 14

Compare the overall average for your outcome (i.e., not by survey year), relative to all other countries. Just as we did in Tutorial 2, with geom_point and geom_text, where each point had an associated two-letter country code.

Mission 15

Do the same for a different outcome of interest, but compare to specific countries (at least 2) instead of the ESS baseline.

Mission 16

Produce a distribution of your outcome by survey year using geom_density_ridges(), as we did in the last tutorial.

Mission 17

Now produce a visual for the average by year of birth, for your outcome of interest.

Mission 18

Now do a geom_smooth() and geom_point(), as we did in the last tutorial.

Mission 19

Produce a last visual of choice. Anything you would like. Experiment, be colorful, or keep it minimalist.

Mission 20

Turn all your variables into numeric ones, including your categorical variables (as they were previously coded). A quick way is to filter and select from the ess and rename your dataset to a new name, say df_corr.

We will use the datasummary_correlation function, which identifies all the numeric variables, and calculates the correlation between each of those variables

Very important: select only 10 variables you want to look into! Make sure it is a subset and not all 2000+ variables.

# datasummary_correlation(subsetdata)