This reconstruction file creates the Political Extremism measures data for the World Value Survey (WVS). The file includes all analysis steps used to calculate the measures as well as the R code required for this.

To reconstruct the results please perform the following steps:

  1. Obtain the reconstruction ZIP file from the author
  2. Unpack the zip file to a direcotry on your PC
  3. Open the file extremism_measures_wvs.Rmd in RStudio
  4. Select the knit Document command from the File menu
    • All data files are located in the data/ sub-directory
    • All graphs and tables are located in the out/ sub-directory

The analysis was developed using R version 4.4.0 (2024-04-24) and RStudio 2024.04.2 release for Windows.

To view the code - click on the ‘Code’ buttons on the right side of the page.

1 Load required resources

Load required libraries and support R files

# load external packages and source helper functions

library(data.table)
library(dplyr) 
library(tidyr)
library(tidyverse)
library(labelled)
library(extrafont)
library(ggplot2)
library(gt)
library(DT)
library(gridExtra)
library(lavaan)
library(psych)

source("af_utils.R")
source("af_data_dictionary.R")
source("af_factor_analysis.R")

#disable the dplyr console message: 
options(dplyr.summarise.grouped = FALSE)
options(dplyr.summarise.inform = FALSE)

# limit uplod file size to 100MB
options(shiny.maxRequestSize=100*1024^2) 

Set the default theme & font for plots and tables

extrafont::loadfonts(device = "win", quiet=TRUE)
ggplot2::theme_set(theme_bw() +
            theme(text = element_text(family = "LM Roman 10", size = 12)))

2 WVS dataset for Political Extremism Analysis

Load the survey data file and select the required columns for the analysis .

df <- as.data.frame(readRDS("WVS/data/WVS_Time_Series_1981-2022_rds_v5_0.rds"))

wave_list <- c(6:7)

wave_vars <- c("year" = "S020", 
               "country_wave" = "S024"
               )
demographic_vars <- c("respondent" = "S007",
                      "sex" = "X001",
                      "age" = "X003", 
                      "emplyment_status" = "X028"
                      )
social_vars <- c("people_of_a_different_race" = "A124_02",      # WVS 1-7
                "heavy_drinkers" = "A124_03",                   # WVS 1-7
                "immigrants_foreign_workers" = "A124_06",       # WVS 1-7
                "people_who_have_aids" = "A124_07",             # WVS 2-7
                "drug_addicts" = "A124_08",                     # WVS 2-7
                "homesexuals" = "A124_09",                      # WVS 2-7
                "people_of_a_different_religon" = "A124_12"     # WVS 3-7
                )
behavioral_vars <- c("violence_against_other_people" = "F114D", # WVS 6-7
                     "terrorism" = "F114E",                     # WVS 7
                     "political_violence" = "E290"              # WVS 7
                     )
cognitive_vars <- c("left_right" = "E033"                       # WVS 1-7
                    )

state_vars <- c("society_attiudes" = "E034"
                )
critical_vars <- setdiff (c(names(cognitive_vars),       
                            names(behavioral_vars), 
                            names(social_vars)),
                          c("terrorism",
                            "political_violence"))

country_critical_pct <- 0.3 # around 300 - 400 respondents min

var_list <- 
  c(wave_vars, demographic_vars, cognitive_vars, social_vars, 
    behavioral_vars, state_vars)

df <- df %>% select(!!var_list)

2.1 Clean the dataset

Convert missing data to NA

labelled::na_values(df) <- c(-5, -4, -3, -2, -1)
df <- labelled::user_na_to_na(df)
df <- labelled::remove_user_na(df)

Convert haven_labelled classes/types to R standard

df <- df %>%
  mutate(year = labelled::to_factor(year, ordered = TRUE)) %>%
  mutate(country_wave = labelled::to_factor(country_wave, ordered = TRUE)) %>%
  mutate(respondent = unclass(respondent)) %>%
  mutate(sex = labelled::to_factor(sex)) %>%
  mutate(age = unclass(age)) %>%
  mutate(emplyment_status = labelled::to_factor(emplyment_status)) %>%
  mutate(left_right = unclass(left_right)) %>%
  mutate(people_of_a_different_race = labelled::to_factor(people_of_a_different_race)) %>%
  mutate(heavy_drinkers = labelled::to_factor(heavy_drinkers)) %>%
  mutate(immigrants_foreign_workers = labelled::to_factor(immigrants_foreign_workers)) %>%
  mutate(people_who_have_aids = labelled::to_factor(people_who_have_aids)) %>%
  mutate(drug_addicts = labelled::to_factor(drug_addicts)) %>%
  mutate(homesexuals = labelled::to_factor(homesexuals)) %>%
  mutate(people_of_a_different_religon = labelled::to_factor(people_of_a_different_religon)) %>%
  mutate(violence_against_other_people = unclass(violence_against_other_people)) %>%
  mutate(terrorism = unclass(terrorism)) %>%
  mutate(political_violence = unclass(political_violence)) %>%
  mutate(society_attiudes = labelled::to_factor(society_attiudes))

Create wave number and country name variables

The country_wave data is in the format “country (wave)” Select data from the list of waves and countries to analyze

# Create a new variable with labels
df$country_wave_label <- levels(df$country_wave)[df$country_wave]

# create separate country and wave variables
df <- df %>% 
  separate(country_wave_label, into = c("country", "wave"), sep = "\\(") %>% 
  mutate(wave = as.numeric(gsub("\\)", "", wave)))

# Remove trailing blanks in the 'country' variable
df$country <- trimws(df$country, which = "right")

# set attributes to be used for printing
attr(df$wave,"label") <- "Survey wave"
attr(df$country,"label") <- "Survey country"

Filter required waves (6 & 7)

# filter waves and countries
df <- df %>%  filter(wave %in% wave_list)

2.2 Remove problematic data

The critical variables are left_right, violence_against_other_people, people_of_a_different_race, heavy_drinkers, immigrants_foreign_workers, people_who_have_aids, drug_addicts, homesexuals, people_of_a_different_religon In each wave, remove countries with less than 30% of critical variables’ data Display the percentage of rows with NA in at least one critical variable Remove all such rows


result <- af_clean_adta(df, 
                       critical_vars = critical_vars, 
                       country_critical_pct = country_critical_pct)

df_cleaned <- result$df
problematic_countries <- result$problematic_countries
percentages <- result$percentages

Percentages of critical variables’ missing data:

DT::datatable(percentages, extensions = "FixedColumns",
              options = list(scrollX = TRUE,
                             fixedColumns = list(leftColumns = 2)))

Countries removed per wave:

# Convert the list to a tibble
problematic_countries_tbl <- tibble(
  Wave = names(problematic_countries),
  Countries = map(problematic_countries, ~ paste(.x, collapse = ", "))
)

# Create the gt table
problematic_countries_table <- problematic_countries_tbl %>%
  gt() %>%
  tab_header(
    title = "Problematic Countries by Wave"
  ) %>%
  cols_label(
    Wave = "Wave",
    Countries = "Deleted Countries"
  ) %>%
  fmt_markdown(columns = Countries) %>%
  tab_options(
    column_labels.font.weight = "bold",
    table.width = pct(100)
  )

# Display the table
problematic_countries_table
Problematic Countries by Wave
Wave Deleted Countries
6

Colombia, Ecuador, Iraq, Jordan, Kazakhstan, Morocco, Poland, Russian Federation, South Africa

7

China, Ecuador, Iran, Iraq, Japan, Kazakhstan, Kyrgyzstan, South Korea, Mongolia, New Zealand, Tajikistan, Uzbekistan, Vietnam

2.3 Save dataset

Dataset saved to ‘WVS/data/wvs.RDS’.

df <- df_cleaned

saveRDS(df, "WVS/data/wvs.RDS")

2.4 Data Dictionary

Create a data dictionary for the survey variables. LaTex version of the data dictionary table is kept in ‘out/data_dictionary.tex’.

dd <- af_create_data_dictionary(df)

tbl <- gt::gt(dd) %>%
  cols_width(2 ~ px(400)) %>%
  tab_options(table.width = pct(80), table.font.size = "10pt") 

# Save in LaTex format
gt::gtsave(tbl, filename = "out/wvs_data_dictionary.tex", 
           encoding = "UTF-8", fileEncoding = "UTF-8", use_glyphs = TRUE)

DT::datatable(dd, extensions = "FixedColumns",
              options = list(scrollX = TRUE,
                             fixedColumns = list(leftColumns = 2)))

2.5 Descriptive Statistics

Create a descriptive statistics table for the survey data. LaTex version of the data dictionary table is kept in ‘out/descriptive_stats.tex’.

dd <- datawizard::describe_distribution(df, include_factors= TRUE) %>%
  select(-Skewness, -Kurtosis)

tbl <- gt::gt(dd)  %>%
  fmt_number(
    columns = c(2, 3, 4),  # Specify columns to format
    decimals = 2,           # Set number of decimal places
    drop_trailing_zeros = TRUE  # Remove trailing zeros after the decimal point
    # scientific = FALSE      # Don't use scientific notation
  ) %>%
  tab_options(table.font.size = "10pt") # table.width = pct(80), 

# Save in LaTex format
gt::gtsave(tbl, filename = "out/wvs_descriptive_stats.tex", 
           encoding = "UTF-8", fileEncoding = "UTF-8", use_glyphs = TRUE)

DT::datatable(dd, extensions = "FixedColumns",
              options = list(scrollX = TRUE,
                             fixedColumns = list(leftColumns = 2)))

3 Political Extremism Measures

Load the survey data file

df <- as.data.frame(readRDS("WVS/data/wvs.RDS"))

3.1 Ideological Dimension

The Ideological dimension is based on respondents self positioning on a scale of 1-10 where left is 1 and right is 10.
The right_wing measure is the same as the left-right self positioning measure.
The left_wing measure is the reverse of the left-right self positioning measure.

right_wing <- df$left_right

scale_min <- 1
scale_max <- 10
left_wing <- (scale_min + scale_max) - df$left_right

df$right_wing <- right_wing
df$left_wing <- left_wing

The following charts provide information on the distribution (histogram) of right and left wing,

g1 <- df %>% 
  group_by(wave) %>%
  ggplot(aes(x = right_wing, fill = as.factor(wave))) +
  geom_histogram(binwidth = 0.5, color = "black", alpha = 0.7, position = "dodge") +
  labs(title = "", 
       x = "Political Right-wing", 
       y = "Frequency")

g2 <- df %>% 
  group_by(wave) %>%
  ggplot(aes(x = left_wing, fill = as.factor(wave))) +
  geom_histogram(binwidth = 0.5, color = "black", alpha = 0.7, position = "dodge") +
  labs(title = "", 
       x = "Political Left-wing", 
       y = "Frequency")

grid.arrange(g1, g2, nrow = 2, ncol = 1)

3.2 Behavioral Dimension

In Wave 6 we only have one variable that reflects attitudes towards violence: ‘violence_against_other_people’ In Wave 7 we have three variables that can be combined together: ‘violence_against_other_people’, ‘terrorism’, and ‘political_violence’

The following analysis refers only for Wave 7

df7 <- df[df$wave == 7,]

behavioral_vars <- c("violence_against_other_people",
                     "terrorism", 
                     "political_violence"
                     )

Combining multiple related questions into a single measure can be beneficial, as it can provide a more comprehensive and reliable assessment of the underlying construct - in this case, support for political violence. It is essential to ensure that the questions are sufficiently related and measure the same underlying construct before combining them.

The first step is to examine the inter-item correlations: Calculate the correlations between each pair of questions (e.g., violence against other people and terrorism, violence against other people and political violence, terrorism and political violence). High positive correlations (typically above 0.3 or 0.4) suggest that the questions are measuring a similar underlying construct and can potentially be combined.

violence_against_other_people terrorism political_violence
1.0000000 0.6420579 0.6233193
0.6420579 1.0000000 0.6552292
0.6233193 0.6552292 1.0000000

The second step is to perform an exploratory factor analysis (EFA) and then or a reliability analysis (e.g., calculate Cronbach’s alpha) on the three questions. If the questions load onto a single factor or have a high alpha value (typically above 0.7), it indicates that they are measuring a common underlying construct and can be combined.

# Reliability analysis (Cronbach's alpha)
alpha_result <- psych::alpha(dfb)
alpha_interpretation <- af_interpret_cronbach(alpha_result)

The Cronbach’s test alpha is 0.841645 indicating Good reliability.

# Parallel analysis to determine number of factors
fa_result <- psych::fa.parallel(dfb, fm="pa", plot = FALSE) 
#> Parallel analysis suggests that the number of factors =  1  and the number of components =  1

The analysis suggests that 1 factor(s) are required.

Based on the indications above we have decide to combine the three questions into one measure.

The first approach for combining the three indicators is to perform a simple average of the three scores. This approach assumes that all indicators are equally important in measuring the construct.
Alternatively we can calculate a weighted average, where each indicator is given a different weight based on its importance. In this approach we will give a higher weight (0.5) to the Political violence question as it relates exactly to our target measure, and a lower weight (0.25) to each of the other indicators.
The last alternative is to use only the scores from the political violence indicator

All three alternatives are calculated.

# Combining the questions
df$violence_mean <- rowMeans(df[, behavioral_vars], na.rm=TRUE) # Simple average

# Weighted Average for wave 7, violence_against_other_people for wave 6
weights <- c(w_other_people = 0.25, w_terrorism = 0.25, w_political = 0.5)
df$violence_weighted <- 
  ifelse(is.na(df$political_violence), df$violence_against_other_people, rowSums(df[, behavioral_vars] * weights))

# Political violence for wave 7, violence_against_other_people for wave 6
df$violence_single <- 
  ifelse(is.na(df$political_violence), df$violence_against_other_people, df$political_violence)

The following chart provides information on the distribution (histogram) of the different alternatives.

g1 <- df %>% 
  group_by(wave) %>%
  ggplot(aes(x = violence_mean, fill = as.factor(wave))) +
  geom_histogram(binwidth = 0.5, color = "black", alpha = 0.7, position = "dodge") +
  labs(title = "", 
       x = "Mean (W7) / Violence Against Other People (W6)", 
       y = "Frequency")

g2 <- df %>%
  group_by(wave) %>%
  ggplot(aes(x = violence_weighted, fill = as.factor(wave))) +
  geom_histogram(binwidth = 0.5, color = "black", alpha = 0.7, position = "dodge") +
  labs(title = "", 
       x = "Weighted average (w7) / Violence Against Other People (W6)", 
       y = "Frequency")

g3 <- df %>%
  group_by(wave) %>%
  ggplot(aes(x = violence_single, fill = as.factor(wave))) +
  geom_histogram(binwidth = 0.5, color = "black", alpha = 0.7, position = "dodge") +
  labs(title = "", 
       x = "Political Violence (W7) / Violence Against Other People (W6)", 
       y = "Frequency")

grid.arrange(g1, g2, g3, nrow = 3, ncol = 1)

3.3 Social Dimension

Select the required columns for the social dimension

social_vars <- c("people_of_a_different_race",
                "heavy_drinkers", 
                "immigrants_foreign_workers",
                "people_who_have_aids",
                "drug_addicts",
                "homesexuals", 
                "people_of_a_different_religon"
                )

3.3.1 Exploratory factor analysis (EFA) - Wave 6.

Since all of the data variables are binary factors (Not mentioned/Mentioned), We use polychoric correlation.


# Select Wave 7 data
dfs <- na.omit(df[df$wave == 6, social_vars])

# Calculate polychoric correlations
numeric_dfs <- data.frame(lapply(dfs, function(x) as.numeric(x)))
polychoric_data <- as.matrix(numeric_dfs)
polychoric_result <- psych::polychoric(polychoric_data)
poly_cor <- polychoric_result$rho

# Replace n.obs with your actual sample size
n.obs <- nrow(polychoric_data)  

# Run fa.parallel with polychoric correlations
fa_result <- psych::fa.parallel(poly_cor, n.obs = n.obs, fm = "ml")

#> Parallel analysis suggests that the number of factors =  3  and the number of components =  2

The analysis suggests that there are 3 latent variables (factors).
(Number of factors with eigen values > eigen values of random data)

fit <- fa(poly_cor, nfactors = fa_result$nfact, fm = "ml")

# plot the factor model
loads <- fit$loadings
psych::fa.diagram(loads)

3.3.2 Exploratory factor analysis (EFA) - Wave 7.


# Select Wave 7 data
dfs <- na.omit(df[df$wave == 7, social_vars])

# Calculate polychoric correlations
numeric_dfs <- data.frame(lapply(dfs, function(x) as.numeric(x)))
polychoric_data <- as.matrix(numeric_dfs)
polychoric_result <- psych::polychoric(polychoric_data)
poly_cor <- polychoric_result$rho

# Replace n.obs with your actual sample size
n.obs <- nrow(polychoric_data)  

# Run fa.parallel with polychoric correlations
fa_result <- psych::fa.parallel(poly_cor, n.obs = n.obs, fm = "ml")

#> Parallel analysis suggests that the number of factors =  3  and the number of components =  2

The analysis suggests that there are 3 latent variables (factors).
(Number of factors with eigen values > eigen values of random data)

fit <- fa(poly_cor, nfactors = fa_result$nfact, fm = "ml")

# plot the factor model
loads <- fit$loadings
psych::fa.diagram(loads)

3.3.3 Confirmatory Factor Analysis (CFA)

Based on the indications from the EFA we perform a Confirmatory Factor Analysis (CFA) for the Political Tolerance latent variable based on the follwoing three factors: people_of_a_different_race, people_of_a_different_religon, and immigrants_foreign_workers

factor_items <- c("people_of_a_different_race",
                  "people_of_a_different_religon",
                  "immigrants_foreign_workers")

# Select the social vars across all waves
dfs <- df[, factor_items]

model <- paste("political_tolerance =~", paste(factor_items, collapse = "+"))
fit <- lavaan::cfa(model, data = dfs, ordered = factor_items)
fit_measures <- fitMeasures(fit)

# summary(fit, fit.measures = TRUE)

The results of the CFA are:

  • CFI (Comparative Fit Index) = 1. Values above 0.90 indicate good fit.
  • TLI (Tucker-Lewis Index) = 1. Values above 0.90 indicate good fit.
  • RMSEA (Root Mean Square Error of Approximation) = 0. Values below 0.08 indicate reasonable fit, below 0.05 indicate good fit.
  • SRMR (Standardized Root Mean Residual) = 5.4670178^{-10}. Values below 0.08 indicate good fit.

Using the CFA model fit, we calculate (predict) the component score for political_tolerance.

pt_predict <- lavPredict(fit, type = "lv")
df$political_tolerance <- pt_predict[, 1]

The following table provides the political tolerance index values based on the different combination of the survey questions.

pt_index <- unique(df[,c(factor_items, "political_tolerance")])
gt::gt(pt_index[order(pt_index$political_tolerance),])
people_of_a_different_race people_of_a_different_religon immigrants_foreign_workers political_tolerance
Not mentioned Not mentioned Not mentioned -0.2022162
Not mentioned Not mentioned Mentioned 0.5275114
Not mentioned Mentioned Not mentioned 0.6184572
Mentioned Not mentioned Not mentioned 0.8030151
Not mentioned Mentioned Mentioned 0.9288598
Mentioned Not mentioned Mentioned 1.0866797
Mentioned Mentioned Not mentioned 1.1326223
Mentioned Mentioned Mentioned 1.4924313

The following charts provide information on the distribution (histogram and density plots) of political tolerance

# Histogram
g1 <- df %>% 
  group_by(wave) %>%
  ggplot(aes(x = political_tolerance, fill = as.factor(wave))) +
  geom_histogram(binwidth = 0.5, color = "black", alpha = 0.7, position = "dodge") +
  labs(title = "", 
       x = "Political Tolerance", 
       y = "Frequency")

g1

Save the data in ‘WVS/data/wvs_pe.RDS’.

saveRDS(df, "WVS/data/wvs_pe.RDS")