# Point the R session at the course work space so that files referenced by a
# bare file name are looked up in this folder.
# NOTE(review): this absolute path only exists on the author's machine; an
# RStudio project (or a path relative to the project root) would make the
# script portable -- confirm before sharing.
setwd("C:/Users/Owner/Downloads/All Downloads/SOC252/Work Space")
# Echo the working directory to confirm the change took effect.
getwd()
## [1] "C:/Users/Owner/Downloads/All Downloads/SOC252/Work Space"
# Packages used throughout this script. unique() guards against a package
# being listed twice (the original vector contained "viridis" two times),
# which would otherwise make library() run twice for the same package.
packages <- unique(c(
  "tidyverse", "modelsummary", "forcats", "RColorBrewer", "fst", "viridis",
  "knitr", "kableExtra", "rmarkdown", "ggridges", "questionr"
)) # add any you need here

# Install only the packages that are not yet in the local library.
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package. library() returns the vector of attached packages, so
# a bare lapply() would print one long list element per package; invisible()
# keeps the console/knitted output limited to the start-up messages.
invisible(lapply(packages, library, character.only = TRUE))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[4]]
## [1] "RColorBrewer" "modelsummary" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "fst" "RColorBrewer" "modelsummary" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "viridis" "viridisLite" "fst" "RColorBrewer" "modelsummary"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[7]]
## [1] "knitr" "viridis" "viridisLite" "fst" "RColorBrewer"
## [6] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[8]]
## [1] "kableExtra" "knitr" "viridis" "viridisLite" "fst"
## [6] "RColorBrewer" "modelsummary" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
##
## [[9]]
## [1] "rmarkdown" "kableExtra" "knitr" "viridis" "viridisLite"
## [6] "fst" "RColorBrewer" "modelsummary" "lubridate" "forcats"
## [11] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "ggridges" "rmarkdown" "kableExtra" "knitr" "viridis"
## [6] "viridisLite" "fst" "RColorBrewer" "modelsummary" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[11]]
## [1] "ggridges" "rmarkdown" "kableExtra" "knitr" "viridis"
## [6] "viridisLite" "fst" "RColorBrewer" "modelsummary" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[12]]
## [1] "questionr" "ggridges" "rmarkdown" "kableExtra" "knitr"
## [6] "viridis" "viridisLite" "fst" "RColorBrewer" "modelsummary"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
# Read the ESS data file (fst format) from the working directory into `ess`.
ess <- read_fst("All-ESS-Data.fst")
# Print a short confirmation message to the console.
print("hell yeah this works!")
## [1] "hell yeah this works!"
# Save the workspace (all objects in the global environment) to the default
# .RData file in the working directory.
save.image()
Create tables between ess and variables of interest. Take note of what you need to clean (i.e., the 7, 8, 9, and 77, 88, 99).
Create tables between ess and variables of interest. Take note of what you need to clean (i.e., the 7, 8, 9, and 77, 88, 99).
# Respondent counts for every country code in the pooled ESS file.
table(ess[["cntry"]])
##
## AL AT BE BG CH CY CZ DE DK EE ES FI FR
## 1201 15225 17451 13240 16925 6065 20090 34425 12408 16856 19452 19532 19038
## GB GR HR HU IE IL IS IT LT LU LV ME MK
## 20979 12558 6535 16642 22233 16218 3975 10178 11652 3187 3921 2478 1429
## NL NO PL PT RO RS RU SE SI SK TR UA XK
## 18329 16065 17689 17881 2146 3548 12458 18216 13484 11292 4272 9987 1295
# Look for the variables of interest in ESS.
# head(ess) would print whole rows; listing column names is lighter.
# Show the first 10 column names (raise n to list more).
head(names(ess), n = 10)
## [1] "name" "essround" "edition" "proddate" "cntry" "idno"
## [7] "dweight" "pspwght" "pweight" "anweight"
# Collect the distinct country codes, in order of first appearance ...
unique_values <- unique(ess[["cntry"]])
# ... and display them.
print(unique_values)
## [1] "AT" "BE" "CH" "CZ" "DE" "DK" "ES" "FI" "FR" "GB" "GR" "HU" "IE" "IL" "IT"
## [16] "LU" "NL" "NO" "PL" "PT" "SE" "SI" "EE" "IS" "SK" "TR" "UA" "BG" "CY" "RU"
## [31] "HR" "LV" "RO" "LT" "AL" "XK" "ME" "RS" "MK"
# Keep only the Swiss respondents (country code "CH").
Switzerland_data <- ess[ess[["cntry"]] == "CH", ]
# Distribution of wrkorg (the work-organisation variable of interest) across
# the whole ESS file.
table(ess[["wrkorg"]])
##
## 1 2 7 8 9
## 59600 359846 304 1043 470
# Coding: 1 = Yes, 2 = No. The interest here is in high levels of work
# organisation, i.e. "Yes". 7 (refusal) and 8 (don't know) must be removed;
# 9 also appears in the table above and is presumably "no answer" -- confirm
# in the codebook.
# Country of interest: https://ess-search.nsd.no/en/variable/query/cntry/1, Wrkorg: https://ess-search.nsd.no/en/variable/query/wrkorg/1
Look into an additional socio-demographic variable (other than the 5 covered in the tutorial), create a table and take note of how to clean it. You can do more than one, but do at least one for the mission.
Provide the ess data portal link here. For example, like so:
https://ess-search.nsd.no/en/variable/32558258-72b1-479b-8cca-49c9d569408d
Take note of the categories and consider how to recode them (using mutate) so as to create fewer categories (if needed) that are meaningful for analytical purposes. For example, for the one above you can consider recoding 1, 2, and 5 together, 3 and 4 together, and 6 separately.
# Raw distribution of netuse, still containing the 77/88/99 codes.
table(ess[["netuse"]])
##
## 0 1 2 3 4 5 6 7 77 88 99
## 76194 38069 4603 3817 8159 9623 27492 67203 50 467 201
# Keep only the netuse column and turn the 77/88/99 codes into NA.
netuse_cleaned <- ess %>%
  select(netuse) %>%
  mutate(netuse = ifelse(netuse %in% c(77, 88, 99), NA, netuse))
# Check the result: only the codes 0-7 should remain.
table(netuse_cleaned)
## netuse
## 0 1 2 3 4 5 6 7
## 76194 38069 4603 3817 8159 9623 27492 67203
Filter to your country of interest and use mutate to clean your variables. If you want, you can “select” all your variables of interest now, including the socio-demographic ones.
Note: whenever you recode or clean, always double check what you did by comparing pre and post. So, for example, if you tried to clean and named your dataset france_clean from the uncleaned france_data, do tables of variables of interest between the two to see if the recodes and cleaning worked as intended.
# Look for the variables of interest in ESS.
# head(ess) would print whole rows; listing column names is lighter.
# Show the first 10 column names (raise n to list more).
head(names(ess), n = 10)
## [1] "name" "essround" "edition" "proddate" "cntry" "idno"
## [7] "dweight" "pspwght" "pweight" "anweight"
# Collect the distinct country codes, in order of first appearance ...
unique_values <- unique(ess[["cntry"]])
# ... and display them.
print(unique_values)
## [1] "AT" "BE" "CH" "CZ" "DE" "DK" "ES" "FI" "FR" "GB" "GR" "HU" "IE" "IL" "IT"
## [16] "LU" "NL" "NO" "PL" "PT" "SE" "SI" "EE" "IS" "SK" "TR" "UA" "BG" "CY" "RU"
## [31] "HR" "LV" "RO" "LT" "AL" "XK" "ME" "RS" "MK"
# Respondent counts per country code.
table(ess[["cntry"]])
##
## AL AT BE BG CH CY CZ DE DK EE ES FI FR
## 1201 15225 17451 13240 16925 6065 20090 34425 12408 16856 19452 19532 19038
## GB GR HR HU IE IL IS IT LT LU LV ME MK
## 20979 12558 6535 16642 22233 16218 3975 10178 11652 3187 3921 2478 1429
## NL NO PL PT RO RS RU SE SI SK TR UA XK
## 18329 16065 17689 17881 2146 3548 12458 18216 13484 11292 4272 9987 1295
# Keep only the Swiss respondents (country code "CH").
Switzerland_data <- ess[ess[["cntry"]] == "CH", ]
# Distribution of wrkorg in the whole ESS file ...
table(ess[["wrkorg"]])
##
## 1 2 7 8 9
## 59600 359846 304 1043 470
# ... and among the Swiss respondents only (pre-cleaning reference).
table(Switzerland_data[["wrkorg"]])
##
## 1 2 7 8
## 2378 13000 1 23
# Coding: 1 = Yes, 2 = No. The interest here is in high levels of work
# organisation, i.e. "Yes". 7 (refusal) and 8 (don't know) must be removed.
# Clean wrkorg for Switzerland.
# Switzerland_data already holds only the "CH" rows, so no further country
# filter is needed. Codes 7, 8 and 9 are all treated as missing: the Swiss
# subset currently shows only 7 and 8, but 9 occurs in the full ESS table and
# is blanked by the later cleaning steps in this script, so it is included
# here for consistency.
Switzerland_wrkorg_cleaned <- Switzerland_data %>%
  mutate(wrkorg = ifelse(wrkorg %in% c(7, 8, 9), NA, wrkorg)) %>%
  select(wrkorg)
# Post-cleaning check: only the codes 1 (Yes) and 2 (No) should remain.
table(Switzerland_wrkorg_cleaned)
## wrkorg
## 1 2
## 2378 13000
Do a datasummary_skim of variables of interest. You can select variables as follows:
# Template: datasummary_skim(dataset %>% select(v1, v2, v3))
# Summary statistics for the cleaned Swiss wrkorg variable.
Switzerland_wrkorg_cleaned %>%
  select(wrkorg) %>%
  datasummary_skim()
| | Unique (#) | Missing (%) | Mean | SD | Min | Median | Max | |
|---|---|---|---|---|---|---|---|---|
| wrkorg | 3 | 9 | 1.8 | 0.4 | 1.0 | 2.0 | 2.0 | |
Do a quick frequency check for socio-demographics of interest, then visualize. Here’s an example for geo:
#filtered_data %>%
# drop_na(geo) %>%
# select(geo) %>%
# freq() %>%
# as.data.frame() %>%
# ggplot(aes(x=factor(rownames(.),
# levels= c("Urban",
# "Peri-Urban",
# "Rural")),
# y=`%`)) +
# geom_col() +
# labs(title = "Distribution of Place of Residence",
# x = "Geo")
#filtered_data %>%
# drop_na(geo) %>%
# select(geo) %>%
# freq() %>%
# as.data.frame() %>%
# ggplot(aes(x=factor(rownames(.),
# levels= c("Urban",
# "Peri-Urban",
# "Rural")),
# y=`%`)) +
# geom_col() +
# labs(title = "Distribution of Place of Residence",
# x = "Geo")
# Frequency table of the cleaned wrkorg codes; the NA row counts the missing
# values.
freq(Switzerland_wrkorg_cleaned$wrkorg)
## n % val%
## 1 2378 14.1 15.5
## 2 13000 76.8 84.5
## NA 1547 9.1 NA
# Label the two substantive codes (1 = Yes, 2 = No). case_match() returns a
# real NA for every other value, whereas the previous recode() produced the
# literal string "NA", which drop_na() cannot remove and which would show up
# as a category of its own.
Switzerland_wrkorg_cleaned_data <- Switzerland_wrkorg_cleaned %>%
  select(wrkorg) %>%
  mutate(wrkorg = case_match(wrkorg, 1 ~ "YES", 2 ~ "NO"))
# Bar chart of the percentage of YES / NO answers among non-missing cases.
# freq() puts the category labels in the row names, hence rownames(.).
Switzerland_wrkorg_cleaned_data %>%
  drop_na(wrkorg) %>%
  freq() %>%
  as.data.frame() %>%
  ggplot(aes(
    x = factor(rownames(.), levels = c("YES", "NO")),
    y = `%`
  )) +
  geom_col() +
  labs(
    title = "Distribution of Switzerland having working associations in the last 12 months",
    x = "wrkorg"
  )
Depending on what you look into, you will note that some of the
percentages seem off relative to what they would be for the entire
population. For example, % rural in the sample vs. % rural in the
population. That is why we eventually need to make adjustments and apply
survey weights. We will talk more about the specific survey design of
the ESS in future lectures.
So let’s turn to conditional probabilities, which give us a better sense of what’s going on conditional on a specific category (i.e., dividing by, say, where someone lives rather than by the total).
Do column percentages (as conditional %) with cprop(). Here’s an example:
# Template from the tutorial: column percentages of a two-way table.
#table(france_data$clsprty, france_data$educ_level) %>%
# cprop()
# NOTE(review): both arguments to table() below are the same wrkorg column,
# so every case falls on the diagonal and cprop() can only return 100% / 0%
# (see the printed result). To obtain informative conditional percentages the
# second argument should be a different variable -- TODO pick the intended
# socio-demographic or outcome variable.
table(Switzerland_wrkorg_cleaned_data$wrkorg, Switzerland_wrkorg_cleaned_data$wrkorg) %>%
cprop()
##
## NO YES All
## NO 100.0 0.0 84.5
## YES 0.0 100.0 15.5
## Total 100.0 100.0 100.0
You can also do row percentages with rprop().
Produce a visual for how a categorical of interest is distributed over time.
Example by survey year:
#table(filtered_data$educ_level, filtered_data$year) %>%
# cprop() %>%
# as.data.frame() %>%
# filter(Var1 != "Total",
# Var2 != "All") %>%
# ggplot(aes(x=Var2 %>% as.character() %>% as.integer(),
# y=Freq,
# color=Var1)) +
# geom_line() +
# labs(title="Educational levels by Survey Year",
# x = "Survey",
# color = "Education")
Example by birth year:
#table(filtered_data$educ_level, filtered_data$year) %>%
# cprop() %>%
# as.data.frame() %>%
# filter(Var1 != "Total",
# Var2 != "All") %>%
# ggplot(aes(x=Var2 %>% as.character() %>% as.integer(),
# y=Freq,
# color=Var1)) +
# geom_line() +
# labs(title="Educational levels by Birth Year",
# x = "Birth Year",
# color = "Education")
# Raw wrkorg distribution in the full ESS file (pre-cleaning reference).
table(ess$wrkorg)
##
## 1 2 7 8 9
## 59600 359846 304 1043 470
# Survey year for each ESS round: round i maps to the i-th entry of
# `replacements`. A vectorised lookup replaces the former for-loop; rounds
# outside 1-10 (or missing) yield NA, exactly as before.
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
ess$year <- replacements[match(ess$essround, seq_along(replacements))]
# Swiss subset, now carrying the year column.
Switzerland_data <- ess[ess$cntry == "CH", ]
# Treat the codes 7, 8 and 9 of wrkorg as missing.
Switzerland_data_cleaned <- Switzerland_data %>%
  mutate(wrkorg = ifelse(wrkorg %in% c(7, 8, 9), NA, wrkorg))
# Drop missing answers and label the substantive codes (1 = Yes, 2 = No).
# case_match() gives a real NA for any other value instead of the literal
# string "NA" that the previous recode() produced.
Switzerland_data_cleaned_Birthyear <- Switzerland_data_cleaned %>%
  drop_na(wrkorg) %>%
  mutate(wrkorg = case_match(wrkorg, 1 ~ "YES", 2 ~ "NO"))
# Column percentages (share of YES / NO within each survey year), reshaped to
# long form and plotted as one line per answer. The "Total" row and "All"
# column added by cprop() are removed before plotting.
table(
  Switzerland_data_cleaned_Birthyear$wrkorg,
  Switzerland_data_cleaned_Birthyear$year
) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(
    Var1 != "Total",
    Var2 != "All"
  ) %>%
  ggplot(aes(
    x = Var2 %>% as.character() %>% as.integer(),
    y = Freq,
    color = Var1
  )) +
  geom_line() +
  labs(
    title = "Switzerland Respondents Having Work Organization in the Last 12 Months by Survey Year",
    # The x axis shows the survey year (Var2 = year), not the birth year.
    x = "Survey Year",
    y = "Percent within survey year",
    color = "Switzerland Respondents Having Work Organization in the Last 12 Months"
  )
Do a second visualization between the same categorical variable and an outcome of interest. Focus this time on computing conditional probabilities or percentages.
Here are two code examples:
# double check clean
#france_data <- france_data %>%
# filter(!is.na(educ_level) & !is.na(clsprty))
# visualize
#table(france_data$clsprty, france_data$educ_level) %>%
# cprop() %>%
#as.data.frame() %>%
#filter(Var1 != "Total",
# Var2 != "All") %>%
#ggplot(aes(x=Var1, y=Freq, fill=Var2)) +
#geom_col(position = "dodge") +
#labs(title="Feeling close to a particular party in France",
# x = "Feeling close (Y/N)",
# fill = "At least BA vs. Not")
# double check clean
#france_clean <- france_data %>%
# filter(!is.na(geo) & !is.na(trstplt))
# calculate conditional probabilities
#france_probs <- france_clean %>%
# count(trstplt, geo) %>%
# group_by(geo) %>%
# mutate(prob = n / sum(n))
# plot
#ggplot(france_probs, aes(x = as.factor(trstplt), y = prob, color = geo)) +
# geom_point() +
# geom_line(aes(group = geo)) +
# labs(title = "Conditional Probabilities of Trust in Politicians",
# subtitle = "by Place of Residence",
# x = "Trust Scale",
# y = "Probability") +
#theme_minimal()
# Number of respondents per ESS round.
table(ess$essround)
##
## 1 2 3 4 5 6 7 8 9 10
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685
##
## 1 2 3 4 5 6 7 8 9 10
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685
# Survey year for each ESS round: round i maps to the i-th entry of
# `replacements` (vectorised lookup instead of a for-loop).
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
ess$year <- replacements[match(ess$essround, seq_along(replacements))]
# Swiss respondents, restricted to the variables needed below.
Switzerland_data <- ess %>%
  filter(cntry == "CH") %>%
  select(cntry, wrkorg, trstplc, year)
# 2016 survey only; Switzerland_data is already limited to "CH", so only the
# year filter is required. Non-substantive codes become NA: 7/8/9 for wrkorg
# and 77/88/99 for trstplc.
Switzerland_data_cleaned <- Switzerland_data %>%
  filter(year == 2016) %>%
  select(wrkorg, trstplc) %>%
  mutate(
    wrkorg = ifelse(wrkorg %in% c(7, 8, 9), NA, wrkorg),
    trstplc = ifelse(trstplc %in% c(77, 88, 99), NA, trstplc)
  )
# Cross-tabulation of wrkorg (rows) by trstplc (columns) after cleaning.
table(Switzerland_data_cleaned)
## trstplc
## wrkorg 0 1 2 3 4 5 6 7 8 9 10
## 1 2 0 6 4 12 18 33 54 78 56 23
## 2 9 8 15 31 50 105 129 231 357 189 109
# double check clean: keep complete cases on both variables
Switzerland_data_cleaned <- Switzerland_data_cleaned %>%
  filter(!is.na(wrkorg) & !is.na(trstplc))
# visualize: column percentages, i.e. the share of each wrkorg answer within
# every trust level. Var1 is wrkorg (x axis) and Var2 is trstplc (fill), so
# the axis and legend are labelled accordingly.
table(Switzerland_data_cleaned$wrkorg, Switzerland_data_cleaned$trstplc) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(
    Var1 != "Total",
    Var2 != "All"
  ) %>%
  ggplot(aes(x = Var1, y = Freq, fill = Var2)) +
  geom_col(position = "dodge") +
  labs(
    title = "Switzerland Respondents Having Work Organization in the Last 12 Months",
    x = "Worked in an organisation (1 = Yes, 2 = No)",
    y = "Percent within trust level",
    fill = "Trust in Police"
  )
Compare your mean(outcome) to the ESS baseline (all, inclusive of your country).
# ESS-wide baseline (all countries, Switzerland included). The year is 2016
# so that it matches the survey year used for the Swiss subset; the previous
# value (2014) compared two different survey waves.
# Non-substantive codes become NA: 7/8/9 for wrkorg, 77/88/99 for trstplc.
ess_data_cleaned <- ess %>%
  filter(year == 2016) %>%
  select(wrkorg, trstplc) %>%
  mutate(
    wrkorg = ifelse(wrkorg %in% c(7, 8, 9), NA, wrkorg),
    trstplc = ifelse(trstplc %in% c(77, 88, 99), NA, trstplc)
  )
# Overall mean of wrkorg in the ESS baseline (single-row result).
avg_wrkorg_ess <- ess_data_cleaned %>%
  summarize(mean_wrkorg_ess = mean(wrkorg, na.rm = TRUE))
# Mean of wrkorg among Swiss respondents at each level of trust in police.
avg_trstplc_Switzerland <- Switzerland_data_cleaned %>%
  group_by(trstplc) %>%
  summarize(mean_wrkorg_switzerland = mean(wrkorg, na.rm = TRUE))
# Attach the single ESS baseline value to every trust level so both series
# can be drawn on the same axes.
combined_data <- cross_join(avg_trstplc_Switzerland, avg_wrkorg_ess)
# Long format: one row per (trust level, series).
long_data <- combined_data %>%
  pivot_longer(
    cols = starts_with("mean_wrkorg"),
    names_to = "work_Type",
    values_to = "Mean_Work"
  )
# wrkorg is coded 1 = Yes, 2 = No, so a lower mean means more "Yes" answers.
# `linewidth` replaces the `size` argument for lines, which is deprecated
# since ggplot2 3.4.0.
ggplot(long_data, aes(x = trstplc, y = Mean_Work, color = work_Type)) +
  geom_line(linewidth = 2) +
  geom_point(size = 3) +
  labs(
    title = "Mean Work by Trust in police",
    x = "Trust in Police",
    y = "Mean of wrkorg (1 = Yes, 2 = No)"
  ) +
  ylim(1.6, 2)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 26 rows containing missing values (`geom_point()`).