SOC Part 3 project

# Point the session at the project folder so that relative paths such as
# "All-ESS-Data.fst" resolve. The folder is machine-specific, so only switch
# when it exists; otherwise stay in the current directory and warn instead of
# aborting the whole script.
project_dir <- "C:/Users/Owner/Downloads/All Downloads/SOC252/Work Space"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  warning("Project folder not found; staying in ", getwd(), call. = FALSE)
}
getwd()

## [1] "C:/Users/Owner/Downloads/All Downloads/SOC252/Work Space"

# This section installs (if needed) and loads all the packages I use.

# Packages used throughout the analysis (add any you need here).
# unique() guards against a name being listed twice, which would make the
# loader below process the same package again.
packages <- unique(c(
  "tidyverse", "modelsummary", "forcats", "RColorBrewer",
  "fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges",
  "questionr"
))

# Install only the packages that are not installed yet.
new_packages <- setdiff(packages, rownames(installed.packages()))
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package. invisible() suppresses the list of attached packages
# that lapply() would otherwise print once per entry.
invisible(lapply(packages, library, character.only = TRUE))

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "rmarkdown"    "kableExtra"   "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "kableExtra"   "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"

# Read the ESS data file; the relative path is resolved against the current
# working directory.
ess <- read_fst("All-ESS-Data.fst")

# Print a confirmation message after the data has been read.

print("hell yeah this works!")

## [1] "hell yeah this works!"

# Save the workspace (global environment) with save.image() defaults.

save.image()

Create tables between ess and variables of interest. Take note of what you need to clean (i.e., the 7, 8, 9, and 77, 88, 99).

Mission 1

Create tables between ess and variables of interest. Take note of what you need to clean (i.e., the 7, 8, 9, and 77, 88, 99).

# Count the rows recorded for each country code in the full ESS data.
table(ess$cntry)

## 
##    AL    AT    BE    BG    CH    CY    CZ    DE    DK    EE    ES    FI    FR 
##  1201 15225 17451 13240 16925  6065 20090 34425 12408 16856 19452 19532 19038 
##    GB    GR    HR    HU    IE    IL    IS    IT    LT    LU    LV    ME    MK 
## 20979 12558  6535 16642 22233 16218  3975 10178 11652  3187  3921  2478  1429 
##    NL    NO    PL    PT    RO    RS    RU    SE    SI    SK    TR    UA    XK 
## 18329 16065 17689 17881  2146  3548 12458 18216 13484 11292  4272  9987  1295

# Inspect the ESS columns to locate the variables of interest.
# head(ess) is an alternative that previews whole rows.

# Show the first 10 column names (change n to list more or fewer).
head(names(ess), n = 10)

##  [1] "name"     "essround" "edition"  "proddate" "cntry"    "idno"    
##  [7] "dweight"  "pspwght"  "pweight"  "anweight"

# Collect each country code once, in order of first appearance.
unique_values <- ess$cntry[!duplicated(ess$cntry)]

# Display the distinct country codes.
print(unique_values)

##  [1] "AT" "BE" "CH" "CZ" "DE" "DK" "ES" "FI" "FR" "GB" "GR" "HU" "IE" "IL" "IT"
## [16] "LU" "NL" "NO" "PL" "PT" "SE" "SI" "EE" "IS" "SK" "TR" "UA" "BG" "CY" "RU"
## [31] "HR" "LV" "RO" "LT" "AL" "XK" "ME" "RS" "MK"

# Keep only the Swiss respondents (country code "CH").
# filter() drops rows whose cntry is NA, whereas logical `[` indexing would
# turn each of them into an all-NA row.
Switzerland_data <- ess %>%
  filter(cntry == "CH")

# Tabulate wrkorg (the work-organisation variable of interest) across the
# whole ESS to see which codes need cleaning.

table(ess$wrkorg)

## 
##      1      2      7      8      9 
##  59600 359846    304   1043    470

# 1 is Yes and 2 is No. I'm interested in high levels of work organization, so Yes. 7 is Refusal and 8 is Don't Know (DK); these codes, along with 9 which also appears in the table above, must be removed.

#Country of interest: https://ess-search.nsd.no/en/variable/query/cntry/1, Wrkorg: https://ess-search.nsd.no/en/variable/query/wrkorg/1

Mission 2

Look into an additional socio-demographic variable (other than the 5 covered in the tutorial), create a table and take note of how to clean it. You can do more than one, but do at least one for the mission.

Provide the ess data portal link here. For example, like so:

https://ess-search.nsd.no/en/variable/32558258-72b1-479b-8cca-49c9d569408d

Take note of the categories and consider how to recode them (using mutate) so as to create fewer categories (if needed) that are meaningful for analytical purposes. For example, for the one above you can consider recoding 1, 2, and 5 together, 3 and 4 together, and 6 separately.

# Tabulate netuse to spot the codes that need cleaning.
table(ess$netuse)

## 
##     0     1     2     3     4     5     6     7    77    88    99 
## 76194 38069  4603  3817  8159  9623 27492 67203    50   467   201

# Keep only netuse and turn codes 77, 88 and 99 into NA.
netuse_cleaned <- ess %>%
  select(netuse) %>%
  mutate(netuse = ifelse(netuse %in% c(77, 88, 99), NA, netuse))

# Tabulate the cleaned column to confirm the three codes are gone.
table(netuse_cleaned)

## netuse
##     0     1     2     3     4     5     6     7 
## 76194 38069  4603  3817  8159  9623 27492 67203

Mission 3

Filter to your country of interest and use mutate to clean your variables. If you want, you can “select” all your variables of interest now, including the socio-demographic ones.

Note: whenever you recode or clean, always double check what you did by comparing pre and post. So, for example, if you tried to clean and named your dataset france_clean from the uncleaned france_data, do tables of variables of interest between the two to see if the recodes and cleaning worked as intended.

# Inspect the ESS columns to locate the variables of interest.
# head(ess) is an alternative that previews whole rows.

# Show the first 10 column names (change n to list more or fewer).
head(names(ess), n = 10)

##  [1] "name"     "essround" "edition"  "proddate" "cntry"    "idno"    
##  [7] "dweight"  "pspwght"  "pweight"  "anweight"

# Collect each country code once, in order of first appearance.
unique_values <- ess$cntry[!duplicated(ess$cntry)]

# Display the distinct country codes.
print(unique_values)

##  [1] "AT" "BE" "CH" "CZ" "DE" "DK" "ES" "FI" "FR" "GB" "GR" "HU" "IE" "IL" "IT"
## [16] "LU" "NL" "NO" "PL" "PT" "SE" "SI" "EE" "IS" "SK" "TR" "UA" "BG" "CY" "RU"
## [31] "HR" "LV" "RO" "LT" "AL" "XK" "ME" "RS" "MK"

# Count the rows recorded for each country code in the full ESS data.
table(ess$cntry)

## 
##    AL    AT    BE    BG    CH    CY    CZ    DE    DK    EE    ES    FI    FR 
##  1201 15225 17451 13240 16925  6065 20090 34425 12408 16856 19452 19532 19038 
##    GB    GR    HR    HU    IE    IL    IS    IT    LT    LU    LV    ME    MK 
## 20979 12558  6535 16642 22233 16218  3975 10178 11652  3187  3921  2478  1429 
##    NL    NO    PL    PT    RO    RS    RU    SE    SI    SK    TR    UA    XK 
## 18329 16065 17689 17881  2146  3548 12458 18216 13484 11292  4272  9987  1295

# Keep only the Swiss respondents (country code "CH").
# filter() drops rows whose cntry is NA, whereas logical `[` indexing would
# turn each of them into an all-NA row.
Switzerland_data <- ess %>%
  filter(cntry == "CH")

# Tabulate wrkorg (the work-organisation variable of interest) across the
# whole ESS, for comparison with the Swiss-only table.

table(ess$wrkorg)

## 
##      1      2      7      8      9 
##  59600 359846    304   1043    470

# Tabulate wrkorg for the Swiss subset only.
table(Switzerland_data$wrkorg)

## 
##     1     2     7     8 
##  2378 13000     1    23

# 1 is Yes and 2 is No. I'm interested in high levels of work organization, so Yes. 7 is Refusal and 8 is Don't Know (DK); both must be removed.

# Clean wrkorg for Switzerland: codes 7, 8 and 9 become NA so that only the
# substantive answers 1 and 2 remain. Code 9 is included because it occurs in
# the full ESS table and is treated as missing in the later cleaning steps.
# The country filter is kept as a safeguard even though Switzerland_data is
# already the Swiss subset.
Switzerland_wrkorg_cleaned <- Switzerland_data %>%
  filter(cntry == "CH") %>%
  mutate(wrkorg = ifelse(wrkorg %in% c(7, 8, 9), NA, wrkorg)) %>%
  select(wrkorg)

# Tabulate the cleaned variable to compare against the uncleaned table.
table(Switzerland_wrkorg_cleaned)

## wrkorg
##     1     2 
##  2378 13000

Mission 4

Do a datasummary_skim of variables of interest. You can select variables as follows:

# datasummary_skim(dataset %>% select(v1, v2, v3))

# Summary statistics (missingness, mean, SD, range) for the cleaned wrkorg.
Switzerland_wrkorg_cleaned %>%
  select(wrkorg) %>%
  datasummary_skim()

	Unique (#)	Missing (%)	Mean	SD	Min	Median	Max
wrkorg	3	9	1.8	0.4	1.0	2.0	2.0

Mission 5

Do a quick frequency check for socio-demographics of interest, then visualize. Here’s an example for geo:

#filtered_data %>%
 # drop_na(geo) %>%
#  select(geo) %>%
 # freq() %>%
#  as.data.frame() %>%
#  ggplot(aes(x=factor(rownames(.),
 #                    levels= c("Urban",
  #                                   "Peri-Urban",
   #                                  "Rural")), 
    #         y=`%`)) +
#  geom_col() +
 # labs(title = "Distribution of Place of Residence",
  #     x = "Geo")

#filtered_data %>%
 # drop_na(geo) %>%
#  select(geo) %>%
 # freq() %>%
#  as.data.frame() %>%
#  ggplot(aes(x=factor(rownames(.),
 #                    levels= c("Urban",
  #                                   "Peri-Urban",
   #                                  "Rural")), 
    #         y=`%`)) +
#  geom_col() +
 # labs(title = "Distribution of Place of Residence",
  #     x = "Geo")

# Frequency table for the cleaned wrkorg (counts, % and valid %).
freq(Switzerland_wrkorg_cleaned$wrkorg)

##        n    % val%
## 1   2378 14.1 15.5
## 2  13000 76.8 84.5
## NA  1547  9.1   NA

# Label the cleaned wrkorg codes for plotting: 1 -> "YES", 2 -> "NO".
# Any other value becomes a real missing value (NA_character_) rather than
# the literal string "NA", which drop_na() and is.na() would not recognise
# as missing.
Switzerland_wrkorg_cleaned_data <- Switzerland_wrkorg_cleaned %>%
  select(wrkorg) %>%
  mutate(wrkorg = recode(as.character(wrkorg),
                         "1" = "YES",
                         "2" = "NO",
                         .default = NA_character_))

 # Bar chart of the percentage of each labelled wrkorg answer; rows with a
 # missing answer are dropped before the frequencies are computed.
 Switzerland_wrkorg_cleaned_data %>%
   drop_na(wrkorg) %>%
   freq() %>%
   as.data.frame() %>%
   # The answer labels live in the row names of the frequency table; turn
   # them into an ordered factor column for the x axis.
   mutate(response = factor(rownames(.), levels = c("YES", "NO", "NA"))) %>%
   ggplot(aes(x = response, y = `%`)) +
   geom_col() +
   labs(
     title = "Distribution of Switzerland having working associations in the last 12 months",
     x = "wrkorg"
   )

Depending on what you look into, you will note that some of the percentages seem off relative to what they would be for the entire population. For example, % rural in the sample vs. % rural in the population. That is why we eventually need to make adjustments and apply survey weights. We will talk more about the specific survey design of the ESS in future lectures.

So let’s turn to conditional probabilities, which give us a better sense of what’s going on conditional on a specific category (i.e., dividing by, say, where someone lives rather than by the total).

Mission 6

Do column percentages (as conditional %) with cprop(). Here’s an example:

#table(france_data$clsprty, france_data$educ_level) %>%
 # cprop()
# Column percentages with cprop().
# NOTE(review): both table dimensions are the same column, so every respondent
# falls on the diagonal (100% / 0%); cross wrkorg with a second variable to
# get informative conditional percentages.
cprop(table(Switzerland_wrkorg_cleaned_data$wrkorg,
            Switzerland_wrkorg_cleaned_data$wrkorg))

##        
##         NO    YES   All  
##   NO    100.0   0.0  84.5
##   YES     0.0 100.0  15.5
##   Total 100.0 100.0 100.0

You can also do row percentages with rprop().

Mission 8

Produce a visual for how a categorical variable of interest is distributed over time.

For year example:

#table(filtered_data$educ_level, filtered_data$year) %>%
 # cprop() %>%
#  as.data.frame() %>%
 # filter(Var1 != "Total",
  #       Var2 != "All") %>%
#  ggplot(aes(x=Var2 %>% as.character() %>% as.integer(), 
 #            y=Freq, 
  #           color=Var1)) +
#  geom_line() +
 # labs(title="Educational levels by Survey Year",
  #     x = "Survey",
   #    color = "Education")

For birth year example:

#table(filtered_data$educ_level, filtered_data$year) %>%
 # cprop() %>%
#  as.data.frame() %>%
 # filter(Var1 != "Total",
  #       Var2 != "All") %>%
#  ggplot(aes(x=Var2 %>% as.character() %>% as.integer(), 
 #            y=Freq, 
  #           color=Var1)) +
#  geom_line() +
 # labs(title="Educational levels by Birth Year",
  #     x = "Birth Year",
   #    color = "Education")

# Tabulate wrkorg across the whole ESS before cleaning.
table(ess$wrkorg)

## 
##      1      2      7      8      9 
##  59600 359846    304   1043    470

# Map each ESS round (1-10) to its survey year with one vectorised lookup
# instead of a loop over rounds. match() returns NA for any round that is
# missing or outside 1-10, so those rows get year NA.
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
ess$year <- replacements[match(ess$essround, seq_along(replacements))]
# Rebuild the Swiss subset from ess so that it includes the year column.
# filter() is used because logical `[` indexing would create all-NA rows for
# any record whose cntry is NA.
Switzerland_data <- ess %>%
  filter(cntry == "CH")

# Treat wrkorg codes 7, 8 and 9 as missing.
Switzerland_data_cleaned <- Switzerland_data %>%
  mutate(wrkorg = ifelse(wrkorg %in% c(7, 8, 9), NA, wrkorg))

# Drop rows without a wrkorg answer and label the remaining codes
# (1 -> "YES", 2 -> "NO"). Any unexpected code becomes a real NA rather than
# the string "NA".
# NOTE(review): despite the "_Birthyear" suffix, this data is plotted by
# survey year below; the name is kept so later references still work.
Switzerland_data_cleaned_Birthyear <- Switzerland_data_cleaned %>%
  drop_na(wrkorg) %>%
  mutate(wrkorg = recode(as.character(wrkorg),
                         "1" = "YES",
                         "2" = "NO",
                         .default = NA_character_))
# Line chart of the column percentage of each wrkorg answer within each
# survey year. The "Total" row and "All" column that cprop() adds are removed
# before plotting. Var2 holds the year as a factor, hence the conversion via
# character to integer for a numeric x axis.
table(Switzerland_data_cleaned_Birthyear$wrkorg,
      Switzerland_data_cleaned_Birthyear$year) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(Var1 != "Total",
         Var2 != "All") %>%
  ggplot(aes(x = Var2 %>% as.character() %>% as.integer(),
             y = Freq,
             color = Var1)) +
  geom_line() +
  # The x axis is the survey year (the second table dimension), not birth year.
  labs(title = "Switzerland Respondents Having Work Organization in the Last 12 Months by Survey Year",
       x = "Survey Year",
       color = "Switzerland Respondents Having Work Organization in the Last 12 Months")

Mission 10

Do a second visualization between the same categorical variable and an outcome of interest. Focus this time on computing conditional probabilities or percentages.

Here are two code examples:

# double check clean
#france_data <- france_data %>%
 # filter(!is.na(educ_level) & !is.na(clsprty))

# visualize
#table(france_data$clsprty, france_data$educ_level) %>%
 # cprop() %>%
  #as.data.frame() %>%
  #filter(Var1 != "Total",
   #      Var2 != "All") %>%
  #ggplot(aes(x=Var1, y=Freq, fill=Var2)) +
  #geom_col(position = "dodge") +
  #labs(title="Feeling close to a particular party in France",
   #    x = "Feeling close (Y/N)",
    #   fill = "At least BA vs. Not")

# double check clean
#france_clean <- france_data %>%
 # filter(!is.na(geo) & !is.na(trstplt))
# calculate conditional probabilities
#france_probs <- france_clean %>%
 # count(trstplt, geo) %>%
#  group_by(geo) %>%
 # mutate(prob = n / sum(n))
# plot
#ggplot(france_probs, aes(x = as.factor(trstplt), y = prob, color = geo)) +
 # geom_point() +
#  geom_line(aes(group = geo)) +
 # labs(title = "Conditional Probabilities of Trust in Politicians",
  #     subtitle = "by Place of Residence",
   #    x = "Trust Scale", 
    #   y = "Probability") +
  #theme_minimal()
# Count rows per ESS round; the rounds are mapped to survey years below.
table(ess$essround)

## 
##     1     2     3     4     5     6     7     8     9    10 
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685

## 
##     1     2     3     4     5     6     7     8     9    10 
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685
# Map each ESS round (1-10) to its survey year with one vectorised lookup
# instead of a loop over rounds. match() returns NA for any round that is
# missing or outside 1-10, so those rows get year NA.
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
ess$year <- replacements[match(ess$essround, seq_along(replacements))]
# Swiss respondents with only the columns needed for this mission.
Switzerland_data <- ess %>%
  filter(cntry == "CH") %>%
  select(cntry, wrkorg, trstplc, year)

# Restrict to the 2016 survey year (the data is already Swiss-only) and turn
# the non-substantive codes of both variables into NA in a single step.
Switzerland_data_cleaned <- Switzerland_data %>%
  filter(year == 2016) %>%
  select(wrkorg, trstplc) %>%
  mutate(
    wrkorg = ifelse(wrkorg %in% c(7, 8, 9), NA, wrkorg),
    trstplc = ifelse(trstplc %in% c(77, 88, 99), NA, trstplc)
  )

# Cross-tabulate the two cleaned columns.
table(Switzerland_data_cleaned)

##       trstplc
## wrkorg   0   1   2   3   4   5   6   7   8   9  10
##      1   2   0   6   4  12  18  33  54  78  56  23
##      2   9   8  15  31  50 105 129 231 357 189 109

# double check clean: keep only rows where both answers are present
Switzerland_data_cleaned <- Switzerland_data_cleaned %>%
  filter(!is.na(wrkorg) & !is.na(trstplc))

# visualize: column percentages of wrkorg within each trust-in-police level.
# Var1 is wrkorg (first table dimension, shown on the x axis) and Var2 is
# trstplc (second dimension, shown as fill), so the axis and legend are
# labelled accordingly. The "Total" row and "All" column added by cprop()
# are removed before plotting.
table(Switzerland_data_cleaned$wrkorg, Switzerland_data_cleaned$trstplc) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(Var1 != "Total",
         Var2 != "All") %>%
  ggplot(aes(x = Var1, y = Freq, fill = Var2)) +
  geom_col(position = "dodge") +
  labs(title = "Switzerland Respondents Having Work Organization in the Last 12 Months",
       x = "Worked in an organisation (1 = Yes, 2 = No)",
       y = "% within trust level",
       fill = "Trust in Police")

Mission 13

Compare your mean(outcome) to the ESS baseline (all, inclusive of your country).

# ESS-wide baseline (all countries). The Swiss subset used below was
# restricted to survey year 2016, so the baseline uses the same year to keep
# the comparison like-for-like. Non-substantive codes become NA.
ess_data_cleaned <- ess %>%
  filter(year == 2016) %>%
  select(wrkorg, trstplc) %>%
  mutate(
    wrkorg = ifelse(wrkorg %in% c(7, 8, 9), NA, wrkorg),
    trstplc = ifelse(trstplc %in% c(77, 88, 99), NA, trstplc)
  )

# Overall mean of wrkorg across the ESS (a single number).
avg_wrkorg_ess <- ess_data_cleaned %>%
  summarize(mean_wrkorg_ess = mean(wrkorg, na.rm = TRUE))

# Mean of wrkorg for Switzerland at each trust-in-police level.
avg_trstplc_Switzerland <- Switzerland_data_cleaned %>%
  group_by(trstplc) %>%
  summarize(mean_wrkorg_switzerland = mean(wrkorg, na.rm = TRUE))

# Attach the single ESS mean to every Swiss row, then reshape so both means
# share one value column and can be drawn as two coloured series.
combined_data <- cross_join(avg_trstplc_Switzerland, avg_wrkorg_ess)
long_data <- combined_data %>%
  pivot_longer(
    cols = starts_with("mean_wrkorg"),
    names_to = "work_Type",
    values_to = "Mean_Work"
  )

# `linewidth` replaces the `size` argument for lines, which is deprecated
# since ggplot2 3.4.0; `size` is still correct for points.
ggplot(long_data, aes(x = trstplc, y = Mean_Work, color = work_Type)) +
  geom_line(linewidth = 2) +
  geom_point(size = 3) +
  labs(title = "Mean Work by Trust in police",
       x = "Trust in Police",
       y = "Mean wrkorg (1 = Yes, 2 = No)") +
  ylim(1.6, 2)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 26 rows containing missing values (`geom_point()`).