This reconstruction file creates the Political Extremism measures data for the World Value Survey (WVS). The file includes all analysis steps used to calculate the measures as well as the R code required for this.
To reconstruct the results please perform the following steps:
The analysis was developed using R version 4.4.0 (2024-04-24) and RStudio 2024.04.2 release for Windows.
To view the code - click on the ‘Code’ buttons on the right side of the page.
Load required libraries and support R files
# load external packages and source helper functions
library(data.table)
library(dplyr)
library(tidyr)
library(tidyverse)
library(labelled)
library(extrafont)
library(ggplot2)
library(gt)
library(DT)
library(gridExtra)
library(lavaan)
library(psych)
source("af_utils.R")
source("af_data_dictionary.R")
source("af_factor_analysis.R")
#disable the dplyr console message:
options(dplyr.summarise.grouped = FALSE)
options(dplyr.summarise.inform = FALSE)
# limit uplod file size to 100MB
options(shiny.maxRequestSize=100*1024^2)
Set the default theme & font for plots and tables
extrafont::loadfonts(device = "win", quiet=TRUE)
ggplot2::theme_set(theme_bw() +
theme(text = element_text(family = "LM Roman 10", size = 12)))
Load the survey data file and select the required columns for the analysis .
df <- as.data.frame(readRDS("WVS/data/WVS_Time_Series_1981-2022_rds_v5_0.rds"))
wave_list <- c(6:7)
wave_vars <- c("year" = "S020",
"country_wave" = "S024"
)
demographic_vars <- c("respondent" = "S007",
"sex" = "X001",
"age" = "X003",
"emplyment_status" = "X028"
)
social_vars <- c("people_of_a_different_race" = "A124_02", # WVS 1-7
"heavy_drinkers" = "A124_03", # WVS 1-7
"immigrants_foreign_workers" = "A124_06", # WVS 1-7
"people_who_have_aids" = "A124_07", # WVS 2-7
"drug_addicts" = "A124_08", # WVS 2-7
"homesexuals" = "A124_09", # WVS 2-7
"people_of_a_different_religon" = "A124_12" # WVS 3-7
)
behavioral_vars <- c("violence_against_other_people" = "F114D", # WVS 6-7
"terrorism" = "F114E", # WVS 7
"political_violence" = "E290" # WVS 7
)
cognitive_vars <- c("left_right" = "E033" # WVS 1-7
)
state_vars <- c("society_attiudes" = "E034"
)
critical_vars <- setdiff (c(names(cognitive_vars),
names(behavioral_vars),
names(social_vars)),
c("terrorism",
"political_violence"))
country_critical_pct <- 0.3 # around 300 - 400 respondents min
var_list <-
c(wave_vars, demographic_vars, cognitive_vars, social_vars,
behavioral_vars, state_vars)
df <- df %>% select(!!var_list)
Convert missing data to NA
labelled::na_values(df) <- c(-5, -4, -3, -2, -1)
df <- labelled::user_na_to_na(df)
df <- labelled::remove_user_na(df)
Convert haven_labelled classes/types to R standard
df <- df %>%
mutate(year = labelled::to_factor(year, ordered = TRUE)) %>%
mutate(country_wave = labelled::to_factor(country_wave, ordered = TRUE)) %>%
mutate(respondent = unclass(respondent)) %>%
mutate(sex = labelled::to_factor(sex)) %>%
mutate(age = unclass(age)) %>%
mutate(emplyment_status = labelled::to_factor(emplyment_status)) %>%
mutate(left_right = unclass(left_right)) %>%
mutate(people_of_a_different_race = labelled::to_factor(people_of_a_different_race)) %>%
mutate(heavy_drinkers = labelled::to_factor(heavy_drinkers)) %>%
mutate(immigrants_foreign_workers = labelled::to_factor(immigrants_foreign_workers)) %>%
mutate(people_who_have_aids = labelled::to_factor(people_who_have_aids)) %>%
mutate(drug_addicts = labelled::to_factor(drug_addicts)) %>%
mutate(homesexuals = labelled::to_factor(homesexuals)) %>%
mutate(people_of_a_different_religon = labelled::to_factor(people_of_a_different_religon)) %>%
mutate(violence_against_other_people = unclass(violence_against_other_people)) %>%
mutate(terrorism = unclass(terrorism)) %>%
mutate(political_violence = unclass(political_violence)) %>%
mutate(society_attiudes = labelled::to_factor(society_attiudes))
Create wave number and country name variables
The country_wave data is in the format “country (wave)” Select data from the list of waves and countries to analyze
# Create a new variable with labels
df$country_wave_label <- levels(df$country_wave)[df$country_wave]
# create separate country and wave variables
df <- df %>%
separate(country_wave_label, into = c("country", "wave"), sep = "\\(") %>%
mutate(wave = as.numeric(gsub("\\)", "", wave)))
# Remove trailing blanks in the 'country' variable
df$country <- trimws(df$country, which = "right")
# set attributes to be used for printing
attr(df$wave,"label") <- "Survey wave"
attr(df$country,"label") <- "Survey country"
Filter required waves (6 & 7)
# filter waves and countries
df <- df %>% filter(wave %in% wave_list)
The critical variables are left_right, violence_against_other_people, people_of_a_different_race, heavy_drinkers, immigrants_foreign_workers, people_who_have_aids, drug_addicts, homesexuals, people_of_a_different_religon In each wave, remove countries with less than 30% of critical variables’ data Display the percentage of rows with NA in at least one critical variable Remove all such rows
result <- af_clean_adta(df,
critical_vars = critical_vars,
country_critical_pct = country_critical_pct)
df_cleaned <- result$df
problematic_countries <- result$problematic_countries
percentages <- result$percentages
Percentages of critical variables’ missing data:
DT::datatable(percentages, extensions = "FixedColumns",
options = list(scrollX = TRUE,
fixedColumns = list(leftColumns = 2)))
Countries removed per wave:
# Convert the list to a tibble
problematic_countries_tbl <- tibble(
Wave = names(problematic_countries),
Countries = map(problematic_countries, ~ paste(.x, collapse = ", "))
)
# Create the gt table
problematic_countries_table <- problematic_countries_tbl %>%
gt() %>%
tab_header(
title = "Problematic Countries by Wave"
) %>%
cols_label(
Wave = "Wave",
Countries = "Deleted Countries"
) %>%
fmt_markdown(columns = Countries) %>%
tab_options(
column_labels.font.weight = "bold",
table.width = pct(100)
)
# Display the table
problematic_countries_table
Problematic Countries by Wave | |
Wave | Deleted Countries |
---|---|
6 | Colombia, Ecuador, Iraq, Jordan, Kazakhstan, Morocco, Poland, Russian Federation, South Africa |
7 | China, Ecuador, Iran, Iraq, Japan, Kazakhstan, Kyrgyzstan, South Korea, Mongolia, New Zealand, Tajikistan, Uzbekistan, Vietnam |
Dataset saved to ‘WVS/data/wvs.RDS’.
df <- df_cleaned
saveRDS(df, "WVS/data/wvs.RDS")
Create a data dictionary for the survey variables. LaTex version of the data dictionary table is kept in ‘out/data_dictionary.tex’.
dd <- af_create_data_dictionary(df)
tbl <- gt::gt(dd) %>%
cols_width(2 ~ px(400)) %>%
tab_options(table.width = pct(80), table.font.size = "10pt")
# Save in LaTex format
gt::gtsave(tbl, filename = "out/wvs_data_dictionary.tex",
encoding = "UTF-8", fileEncoding = "UTF-8", use_glyphs = TRUE)
DT::datatable(dd, extensions = "FixedColumns",
options = list(scrollX = TRUE,
fixedColumns = list(leftColumns = 2)))
Create a descriptive statistics table for the survey data. LaTex version of the data dictionary table is kept in ‘out/descriptive_stats.tex’.
dd <- datawizard::describe_distribution(df, include_factors= TRUE) %>%
select(-Skewness, -Kurtosis)
tbl <- gt::gt(dd) %>%
fmt_number(
columns = c(2, 3, 4), # Specify columns to format
decimals = 2, # Set number of decimal places
drop_trailing_zeros = TRUE # Remove trailing zeros after the decimal point
# scientific = FALSE # Don't use scientific notation
) %>%
tab_options(table.font.size = "10pt") # table.width = pct(80),
# Save in LaTex format
gt::gtsave(tbl, filename = "out/wvs_descriptive_stats.tex",
encoding = "UTF-8", fileEncoding = "UTF-8", use_glyphs = TRUE)
DT::datatable(dd, extensions = "FixedColumns",
options = list(scrollX = TRUE,
fixedColumns = list(leftColumns = 2)))
Load the survey data file
df <- as.data.frame(readRDS("WVS/data/wvs.RDS"))
The Ideological dimension is based on respondents self positioning on
a scale of 1-10 where left is 1 and right is 10.
The right_wing measure is the same as the left-right self positioning
measure.
The left_wing measure is the reverse of the left-right self positioning
measure.
right_wing <- df$left_right
scale_min <- 1
scale_max <- 10
left_wing <- (scale_min + scale_max) - df$left_right
df$right_wing <- right_wing
df$left_wing <- left_wing
The following charts provide information on the distribution (histogram) of right and left wing,
g1 <- df %>%
group_by(wave) %>%
ggplot(aes(x = right_wing, fill = as.factor(wave))) +
geom_histogram(binwidth = 0.5, color = "black", alpha = 0.7, position = "dodge") +
labs(title = "",
x = "Political Right-wing",
y = "Frequency")
g2 <- df %>%
group_by(wave) %>%
ggplot(aes(x = left_wing, fill = as.factor(wave))) +
geom_histogram(binwidth = 0.5, color = "black", alpha = 0.7, position = "dodge") +
labs(title = "",
x = "Political Left-wing",
y = "Frequency")
grid.arrange(g1, g2, nrow = 2, ncol = 1)
In Wave 6 we only have one variable that reflects attitudes towards violence: ‘violence_against_other_people’ In Wave 7 we have three variables that can be combined together: ‘violence_against_other_people’, ‘terrorism’, and ‘political_violence’
The following analysis refers only for Wave 7
df7 <- df[df$wave == 7,]
behavioral_vars <- c("violence_against_other_people",
"terrorism",
"political_violence"
)
Combining multiple related questions into a single measure can be beneficial, as it can provide a more comprehensive and reliable assessment of the underlying construct - in this case, support for political violence. It is essential to ensure that the questions are sufficiently related and measure the same underlying construct before combining them.
The first step is to examine the inter-item correlations: Calculate the correlations between each pair of questions (e.g., violence against other people and terrorism, violence against other people and political violence, terrorism and political violence). High positive correlations (typically above 0.3 or 0.4) suggest that the questions are measuring a similar underlying construct and can potentially be combined.
violence_against_other_people | terrorism | political_violence |
---|---|---|
1.0000000 | 0.6420579 | 0.6233193 |
0.6420579 | 1.0000000 | 0.6552292 |
0.6233193 | 0.6552292 | 1.0000000 |
The second step is to perform an exploratory factor analysis (EFA) and then or a reliability analysis (e.g., calculate Cronbach’s alpha) on the three questions. If the questions load onto a single factor or have a high alpha value (typically above 0.7), it indicates that they are measuring a common underlying construct and can be combined.
# Reliability analysis (Cronbach's alpha)
alpha_result <- psych::alpha(dfb)
alpha_interpretation <- af_interpret_cronbach(alpha_result)
The Cronbach’s test alpha is 0.841645 indicating Good reliability.
# Parallel analysis to determine number of factors
fa_result <- psych::fa.parallel(dfb, fm="pa", plot = FALSE)
#> Parallel analysis suggests that the number of factors = 1 and the number of components = 1
The analysis suggests that 1 factor(s) are required.
Based on the indications above we have decide to combine the three questions into one measure.
The first approach for combining the three indicators is to perform a
simple average of the three scores. This approach assumes that all
indicators are equally important in measuring the construct.
Alternatively we can calculate a weighted average, where each indicator
is given a different weight based on its importance. In this approach we
will give a higher weight (0.5) to the Political violence question as it
relates exactly to our target measure, and a lower weight (0.25) to each
of the other indicators.
The last alternative is to use only the scores from the political
violence indicator
All three alternatives are calculated.
# Combining the questions
df$violence_mean <- rowMeans(df[, behavioral_vars], na.rm=TRUE) # Simple average
# Weighted Average for wave 7, violence_against_other_people for wave 6
weights <- c(w_other_people = 0.25, w_terrorism = 0.25, w_political = 0.5)
df$violence_weighted <-
ifelse(is.na(df$political_violence), df$violence_against_other_people, rowSums(df[, behavioral_vars] * weights))
# Political violence for wave 7, violence_against_other_people for wave 6
df$violence_single <-
ifelse(is.na(df$political_violence), df$violence_against_other_people, df$political_violence)
The following chart provides information on the distribution (histogram) of the different alternatives.
g1 <- df %>%
group_by(wave) %>%
ggplot(aes(x = violence_mean, fill = as.factor(wave))) +
geom_histogram(binwidth = 0.5, color = "black", alpha = 0.7, position = "dodge") +
labs(title = "",
x = "Mean (W7) / Violence Against Other People (W6)",
y = "Frequency")
g2 <- df %>%
group_by(wave) %>%
ggplot(aes(x = violence_weighted, fill = as.factor(wave))) +
geom_histogram(binwidth = 0.5, color = "black", alpha = 0.7, position = "dodge") +
labs(title = "",
x = "Weighted average (w7) / Violence Against Other People (W6)",
y = "Frequency")
g3 <- df %>%
group_by(wave) %>%
ggplot(aes(x = violence_single, fill = as.factor(wave))) +
geom_histogram(binwidth = 0.5, color = "black", alpha = 0.7, position = "dodge") +
labs(title = "",
x = "Political Violence (W7) / Violence Against Other People (W6)",
y = "Frequency")
grid.arrange(g1, g2, g3, nrow = 3, ncol = 1)
3.3 Social Dimension
Select the required columns for the social dimension
3.3.1 Exploratory factor analysis (EFA) - Wave 6.
Since all of the data variables are binary factors (Not mentioned/Mentioned), We use polychoric correlation.
The analysis suggests that there are 3 latent variables (factors).
(Number of factors with eigen values > eigen values of random data)
3.3.2 Exploratory factor analysis (EFA) - Wave 7.
The analysis suggests that there are 3 latent variables (factors).
(Number of factors with eigen values > eigen values of random data)
3.3.3 Confirmatory Factor Analysis (CFA)
Based on the indications from the EFA we perform a Confirmatory Factor Analysis (CFA) for the Political Tolerance latent variable based on the follwoing three factors: people_of_a_different_race, people_of_a_different_religon, and immigrants_foreign_workers
The results of the CFA are:
Using the CFA model fit, we calculate (predict) the component score for political_tolerance.
The following table provides the political tolerance index values based on the different combination of the survey questions.
The following charts provide information on the distribution (histogram and density plots) of political tolerance