1 Required Libraries

# load external packages and source helper functions

library(knitr)
library(kableExtra)

library(dplyr) 
library(haven)
library(ggplot2)
library(ggcorrplot)
library(viridis)
library(tidyverse)
library(gridExtra)
library(gt)
library(lavaan)
library(semTools)
library(semPlot)
library(psych)

library(afcommon)

source("af_common_add_ons.R")

options(dplyr.summarise.inform = FALSE)

Set the default theme & font for plots and tables

# Base font size applied to every text element of the default plot theme
font_size <- 6

# Black-and-white theme in which all text components share `font_size`
af_theme <- local({
  sized_text <- element_text(size = font_size)

  theme_bw(base_size = font_size) +
    theme(
      plot.title = sized_text,    # Plot title
      axis.title = sized_text,    # Axis titles
      axis.text = sized_text,     # Axis tick labels
      legend.title = sized_text,  # Legend title
      legend.text = sized_text,   # Legend text
      strip.text = sized_text     # Facet variable's name
    )
})

# Make the theme the session-wide default for all subsequent ggplot2 plots
ggplot2::theme_set(af_theme)

Load the original SPSS data (.sav) and convert it to RDS (RDS files load much faster in R and take up less space).

# Input (SPSS) and output (RDS) locations of the combined survey file
sav_file_name <- "Israel Survey/data/combined 1-6.sav"
rds_file_name <- "Israel Survey/data/combined 1-6.RDS"

# Read the SPSS file, then strip the classes haven attaches to each column
# (unclass keeps the remaining attributes such as the value labels)
df <- sav_file_name %>%
  haven::read_sav() %>%
  as.data.frame() %>%
  lapply(unclass) %>%
  as.data.frame()

# Keep an RDS copy of the converted data
saveRDS(df, rds_file_name)

1.1 Tidy Survey Datafile

Remove un-needed spss attributes

# Drop the SPSS display attributes from every column in a single pass;
# assigning through df[] keeps df a data frame with the same column names
df[] <- lapply(df, function(column) {
  attr(column, "format.spss") <- NULL
  attr(column, "display_width") <- NULL
  column
})

1.1.1 Identify Duplicated Rows

Check for duplicated rows

# Flag every row that has an exact duplicate elsewhere in the data.
# Scanning from both ends marks all members of a duplicate group,
# not only the second and later occurrences.
duplicated_rows <- duplicated(df) | duplicated(df, fromLast = TRUE)

if (any(duplicated_rows)) {
  print(df[duplicated_rows, ])
} else {
  # print() shows escape sequences literally, so no trailing "\n" here
  print("No duplicated rows")
}

[1] “No duplicated rows”

1.1.2 Fix Dates

Convert StartDate and EndDate from date-time format to date only format

# Convert the date-time columns to plain dates.
# The time zone is pinned to UTC for both parsing and truncation: without it,
# as.POSIXct() parses character input in the machine's local zone while
# as.Date() converts in UTC, so the resulting date can shift by one day
# depending on where the script is run.
# NOTE(review): assumes StartDate/EndDate hold "YYYY-MM-DD HH:MM:SS" text or
# UTC-based date-times - confirm against the .sav file.
date_time_format <- "%Y-%m-%d %H:%M:%S"

df$start_date <- as.Date(
  as.POSIXct(df$StartDate, format = date_time_format, tz = "UTC"),
  tz = "UTC"
)
df$end_date <- as.Date(
  as.POSIXct(df$EndDate, format = date_time_format, tz = "UTC"),
  tz = "UTC"
)

1.1.3 Fix Attributes

Fix attributes originating from SPSS. Labels are used to document the survey variables (data dictionary) as well as to convert the variable types (when required, mainly for converting to factors). We add the fourth wave as a label of the Wave variable.

# Value labels of the sample (population group) variable
attr(df$sample, "labels") <- c(
  Jewish = 1,
  Arab = 2
)

# Value labels of the Wave variable, including the fourth (panel) wave
attr(df$Wave, "labels") <- c(
  First = 1,
  Second = 2,
  Third = 3,
  Fourth = 4,
  Fifth = 5,
  Sixth = 6
)

1.1.4 Complement Panel Data

Wave 3 and wave 4 are part of a data panel (same survey respondents). The data of wave 4 is kept in the columns ending with ’_P’. In some cases demographic data is missing in one wave (wave 3 or wave 4) but exists in the other wave. In such cases we complement the missing data with the data that exists in the other wave of the panel.

# Demographic columns to complement
col_names <- c("age", "gender", "residence", "region", "religion", "religiosity_",
               "politi_orient_1", "political_identifi_1", "income",
               "education", "marital_status", "origin")

# Matching Wave 4 columns (same name with the "_P" suffix)
col_list <- paste0(col_names, "_P")

# Fail early with a clear message if an expected column is absent
missing_cols <- setdiff(c(col_names, col_list), names(df))
if (length(missing_cols) > 0) {
  stop("Panel columns not found in df: ",
       paste(missing_cols, collapse = ", "), call. = FALSE)
}

# Panel respondents: Wave 3 rows that carry both a Wave 3 and a Wave 4 response id.
# which() keeps only rows where the condition is TRUE, so an NA in Wave or in
# either id can no longer reach the data-frame assignment (which rejects NA
# subscripts).
rows_to_modify <- which(
  df$Wave == 3 & df$ResponseId != "" & df$ResponseId_P != ""
)

for (col in col_names) {
  col_p <- paste0(col, "_P")

  # Missing Wave 3 value: take the Wave 4 value
  fill_rows <- rows_to_modify[is.na(df[[col]][rows_to_modify])]
  df[[col]][fill_rows] <- df[[col_p]][fill_rows]

  # Missing Wave 4 value: take the Wave 3 value
  fill_rows <- rows_to_modify[is.na(df[[col_p]][rows_to_modify])]
  df[[col_p]][fill_rows] <- df[[col]][fill_rows]
}

1.1.5 Add Fourth Wave Rows

Wave 3 and Wave 4 are part of a panel (same survey respondents). The data of wave 4 is stored in rows of wave 3 under columns that end with ’_P’. We add additional rows to the survey table with the data of wave 4. This makes it easy to perform calculations or generate statistics across all waves. We keep the _P columns intact.

# Use the Wave 3 ResponseId as respondent_id so that the Wave 4 rows created
# below can be matched to the same respondent
df$respondent_id <- df$ResponseId

# Copy the Wave 3 rows that have a non-empty ResponseId_P and mark the copy as Wave 4
df_wave_3 <- df %>%
  filter(Wave == 3, ResponseId_P != "") %>%
  mutate(Wave = 4)
attributes(df_wave_3$Wave) <- attributes(df$Wave)  # keep attributes

# Respondents that answered Wave 4 only (Wave is NA but ResponseId_P is not NA):
# set their Wave to 4
df_na_wave <- df %>%
  filter(is.na(Wave), !is.na(ResponseId_P)) %>%
  mutate(Wave = 4)
attributes(df_na_wave$Wave) <- attributes(df$Wave)  # keep attributes

# Rows that already have a wave number
df_main <- df %>%
  filter(!is.na(Wave))

# Combine: rows with a known wave, the Wave-NA rows now set to Wave 4, and the
# extra copy of the Wave 3 panel rows set to Wave 4
df <- rbind(df_main, df_na_wave, df_wave_3) # use rbind to keep the attributes

# Columns that do not end with "_P".
# endsWith() is used instead of the regex "[^_P]$": that pattern is a character
# class that rejects any name whose LAST character is "_" or "P" (for example
# "religiosity_"), so such columns were never refreshed from their "_P" twin.
non_P_columns <- names(df)[!endsWith(names(df), "_P")]

# Keep only the base columns whose "_P" counterpart exists in the data
P_columns <- paste0(non_P_columns, "_P")
has_P_column <- P_columns %in% names(df)
P_columns <- P_columns[has_P_column]
non_P_columns <- non_P_columns[has_P_column]

# Rows of Wave 4 (Wave holds no NA here: NA rows were either set to 4 or dropped)
rows_to_modify <- df$Wave == 4

# Replace the values of the base columns with the values of their "_P" columns
if (length(P_columns) > 0) {
  df[rows_to_modify, non_P_columns] <- df[rows_to_modify, P_columns]
}

1.1.6 Factor Conversion

The ‘labels’ attribute provides information on the variables. Variables with only one value (Numeric, character or date) do not have the ‘labels’ attribute. Numeric attributes that represent a value within a scale (Likert type answer) have a ‘labels’ attribute that includes a list with the values of the scale. The first and last values are in the form “? - S” where ? is a digit and S is a string that provides the description of the edges of the scale. The ‘labels’ attributes of the rest of the variables in the survey data include a named list of the potential values (numeric values) and their description. We identify these variables and convert them to factors.

# Decide whether a column should be converted to a factor, based on the name
# of the first entry of its "labels" attribute.
#
# col_name: name of the column to inspect.
# data:     data frame holding the column; defaults to the global `df`, which
#           is what existing one-argument calls rely on.
#
# Returns TRUE when the column has a non-empty "labels" attribute whose first
# name does NOT look like a scale edge (a digit followed by "=", or a name
# ending in a newline plus a digit); FALSE otherwise. Always a single
# TRUE/FALSE, never NA.
check_first_element_name <- function(col_name, data = df) {
  col_attr <- attr(data[[col_name]], "labels", exact = TRUE)

  # No labels at all: nothing to convert
  if (is.null(col_attr) || length(col_attr) == 0) {
    return(FALSE)
  }

  # Unnamed labels cannot supply factor labels; without this guard the pattern
  # test below would yield NA instead of TRUE/FALSE
  name1 <- names(col_attr)[1]
  if (is.null(name1) || is.na(name1)) {
    return(FALSE)
  }

  !(grepl("^\\d\\s*=.*$", name1) || grepl("\n\\d$", name1))
}

# Columns whose "labels" attribute describes categories (not a scale).
# vapply() guarantees a logical vector even when df has no columns.
is_categorical <- vapply(names(df), check_first_element_name, logical(1))
col_names <- names(df)[is_categorical]

# Convert the selected columns to factors and restore their documentation attributes
for (col_name in col_names) {
  col <- df[[col_name]]

  # exact = TRUE: without it a request for "label" silently partial-matches
  # "labels" when the column has no "label" attribute of its own
  var_label <- attr(col, "label", exact = TRUE)
  value_labels <- attr(col, "labels", exact = TRUE)

  # Values that are not listed in the labels become NA
  df[[col_name]] <-
    factor(col, levels = value_labels, labels = names(value_labels))

  # factor() drops the original attributes, so put them back
  attr(df[[col_name]], "label") <- var_label
  attr(df[[col_name]], "labels") <- value_labels
}

Convert all character variables to factors

# Turn every remaining character column into a factor
df <- mutate(df, across(where(is.character), as.factor))

1.1.7 Fix column names

  • columns ending with ’_’
# Drop the trailing underscore from the religiosity column name
df <- df %>%
  dplyr::rename(religiosity = religiosity_)

1.1.8 Hebrew to English

  • gender
  • religiosity
  • religion
  • income
  • leastliked
  • marital status
  • education
  • region

# English replacements for the Hebrew factor levels, one entry per variable.
# The replacement is positional: each vector lists the new names in the order
# of the factor's existing levels. The Hebrew originals are listed next to
# each translation for reference.
english_levels <- list(
  # Gender:
  # "זכר" (Zakhar) - Male
  # "נקבה" (Nekevah) - Female
  # "אחר" (Acher) - Other
  gender = c("Male", "Female", "Other"),

  # Religiosity:
  # חילוני/ת (chiloni/t) - Secular
  # מסורתי/ת (masorti/t) - Traditional
  # דתי/ה (dati/ya) - Religious
  # דתי/ה-לאומי/ת (dati/ya-leumi/t) - National Religious
  # חרדי/ת (charedi/t) - Ultra-Orthodox
  # חרדי/ת לאומי/ת (charedi/t leumi/t) - National Ultra-Orthodox
  religiosity = c("Secular", "Traditional", "Religious",
                  "National Religious", "Ultra-Orthodox", "National Ultra-Orthodox"),

  # Religion:
  # יהודי/ה (Yehudi/ya) - Jewish
  # מוסלמי/ת (Muslemi/t) - Muslim
  # נוצרי/ה (Notzri/ya) - Christian
  # דרוזי/ת (Druzi/t) - Druze
  # אחר (Acher) - Other
  religion = c("Jewish", "Muslim", "Christian", "Druze", "Other"),

  # Income:
  # "הרבה מתחת לממוצע" (Harbeh mitachat lamemutza) - Much below average
  # "קצת מתחת לממוצע" (Ktzat mitachat lamemutza) - Slightly below average
  # "כמו הממוצע" (Kmo hamamutza) - About average
  # "קצת מעל הממוצע" (Ktzat me'al hamamutza) - Slightly above average
  # "הרבה מעל הממוצע" (Harbeh me'al hamamutza) - Much above average
  income = c("Much below average", "Slightly below average",
             "About average",
             "Slightly above average", "Much above average"),

  # Least-liked population group:
  # "מתיישבים ביהודה ושומרון" (Mityashvim beYehuda veShomron) - Settlers in Judea and Samaria
  # "ערבים-ישראלים" (Aravim-Israelim) - Israeli Arabs
  # "שמאלנים" (Smolanim) - Leftists
  # "ימנים" (Yemanim) - Rightists
  # "מזרחים" (Mizrachim) - Middle Eastern Jews
  # "אשכנזים" (Ashkenazim) - Ashkenazi Jews
  # "חרדים" (Charedim) - Ultra-Orthodox Jews
  # "חילונים" (Chilonim) - Secular Jews
  # "עולים מבריה״מ לשעבר" (Olim miBrit haMoatzot leshe'avar) - Immigrants from the former Soviet Union
  # "אתיופים" (Ethiopim) - Ethiopians
  # "מהגרים/פליטים מאפריקה" (Mehagrim/Plitim meAfrika) - Migrants/Refugees from Africa
  # "עובדים זרים" (Ovdim Zarim) - Foreign Workers
  # "להט״בים" (Lehatbim) - LGBTQ+
  # NOTE(review): "SOviet" below looks like a typo for "Soviet"; it is left
  # unchanged because the level name may be referenced by other scripts.
  leastliked = c("Settlers in Judea and Samaria",
                 "Israeli Arabs",
                 "Leftists",
                 "Rightists",
                 "Middle Eastern Jews",
                 "Ashkenazi Jews",
                 "Ultra-Orthodox Jews",
                 "Secular Jews",
                 "SOviet Union Immigrants",
                 "Ethiopians",
                 "African Migrants/Refugees",
                 "Foreign Workers",
                 "LGBTQ+"),

  # Marital status:
  # "רווק/ה" (Ravak/Revakah) - Single
  # "נשוי/נשואה" (Nasui/Nesuah) - Married
  # "גרוש/ה" (Garush/Gerushah) - Divorced
  # "אלמן/אלמנה" (Alman/Almanah) - Widowed
  marital_status = c("Single", "Married", "Divorced", "Widowed"),

  # Education level:
  # "יסודית" (Yesodit) - Elementary
  # "תיכונית" (Tichonit) - High School
  # "על-תיכונית שאינה אקדמית" (Al-Tichonit She'eina Akademit) - Post-Secondary Non-Academic
  # "אקדמית" (Akademit) - Academic (Higher Education)
  # "ישיבה" (Yeshiva) - Yeshiva (Religious Jewish education)
  # "אחר" (Acher) - Other
  education = c("Elementary", "High School", "Post-Secondary Non-Academic",
                "Academic", "Yeshiva", "Other"),

  # District:
  # מחוז ירושלים (Mechoz Yerushalayim) - Jerusalem District
  # מחוז צפון (Mechoz Tzafon) - Northern District
  # מחוז חיפה (Mechoz Haifa) - Haifa District
  # מחוז מרכז (Mechoz Merkaz) - Central District
  # מחוז תל אביב (Mechoz Tel Aviv) - Tel Aviv District
  # מחוז דרום (Mechoz Darom) - Southern District
  # מחוז יהודה ושומרון (Mechoz Yehuda VeShomron) - Judea and Samaria District
  region = c("Jerusalem District", "Northern District",
             "Haifa District", "Central District",
             "Tel Aviv District", "Southern District",
             "Judea and Samaria District")
)

# Rename the levels of each listed variable in place
for (variable in names(english_levels)) {
  levels(df[[variable]]) <- english_levels[[variable]]
}

# Income is an ordered scale (from "Much below average" to "Much above average")
df$income <- factor(df$income, ordered = TRUE)

Set correct order to Wave and to religiosity

# Intended level order: waves chronologically, religiosity from secular to
# national ultra-orthodox
wave_order <- c("First", "Second", "Third", "Fourth", "Fifth", "Sixth")
religiosity_order <- c(
  "Secular", "Traditional", "Religious",
  "National Religious", "Ultra-Orthodox", "National Ultra-Orthodox"
)

# Rebuild both factors with their levels in the intended order
df <- mutate(
  df,
  Wave = factor(Wave, levels = wave_order),
  religiosity = factor(religiosity, levels = religiosity_order)
)

Filter out records with no Wave number or records with questionable data (age > 120).

# Row count before filtering
n1 <- nrow(df)

# Keep rows that have a wave and whose age is either missing or at most 120
df <- df %>%
  filter(!is.na(Wave), is.na(age) | age <= 120)

# Row count after filtering
n2 <- nrow(df)

cat(sprintf("Removed %d records\n", n1 - n2))

Removed 1 records

Filter out rows with empty, NA, or chr(0) respondent_id for Third and Fourth waves only

# Row count before filtering
n1 <- nrow(df)

# A row is kept when it is outside the panel waves, or when it has a usable
# (non-missing, non-empty) respondent_id
panel_waves <- c("Third", "Fourth")

df <- df %>%
  filter(
    !(Wave %in% panel_waves) |
      !(is.na(respondent_id) | respondent_id == "")
  )

# Row count after filtering
n2 <- nrow(df)

cat(sprintf("Removed %d records\n", n1 - n2))

Removed 8 records

Filter out respondents where age in wave 3 is higher than in wave 4

# Row count before filtering
n1 <- nrow(df)

# One row per panel respondent, with the reported age of each panel wave in
# its own column (Third, Fourth)
panel_ages <- df %>%
  filter(Wave %in% c("Third", "Fourth"), !is.na(age)) %>%
  dplyr::select(respondent_id, Wave, age) %>%
  pivot_wider(names_from = Wave, values_from = age)

# Respondents who reported a higher age in the third wave than in the fourth
problematic_ids <- panel_ages %>%
  filter(!is.na(Third), !is.na(Fourth), Third > Fourth) %>%
  pull(respondent_id)

# Drop all rows of those respondents
df <- df %>%
  filter(!(respondent_id %in% problematic_ids))

# Row count after filtering
n2 <- nrow(df)

cat(sprintf("Removed %d records\n", n1 - n2))

Removed 20 records

Add age group variable

# Create age group variable (move to measures)
df$age_group <- af_create_groups(df$age, c(18, 30, 45, 60, Inf), 
                                 c("18–30", "31–45", "46–60", "60plus"))

Create numeric Wave indicator

# Numeric wave indicator: the position of each row's Wave level (First = 1, ...)
df[["nwave"]] <- as.numeric(df[["Wave"]])

Print percentages of missing data (NA)

1.2 Data Dictionary

Create a data dictionary for the survey variables. A LaTeX version of the data dictionary table is kept in ‘out/data_dictionary.tex’.

# Build the data dictionary with the project helper
dd <- af_create_data_dictionary(df)

# Widen the second column and set the overall table width and font size
tbl <- cols_width(dd, 2 ~ px(400))
tbl <- tab_options(tbl, table.width = pct(80), table.font.size = "10pt")

# # Save in LaTex format
# gt::gtsave(tbl, filename = "out/Israel_data_dictionary.tex", 
#            encoding = "UTF-8", fileEncoding = "UTF-8", use_glyphs = TRUE)

# Show the dictionary as an interactive table
DT::datatable(as.data.frame(tbl))

1.3 Descriptive Statistics

Create a descriptive statistics table for the survey data. A LaTeX version of the descriptive statistics table is kept in ‘out/descriptive_stats.tex’.

# Distribution summary of every variable (factors included), without the
# skewness and kurtosis columns
dd <- datawizard::describe_distribution(df, include_factors= TRUE)
dd <- dplyr::select(dd, -Skewness, -Kurtosis)

# Format columns 2-4 with at most two decimals and no trailing zeros
tbl <- gt::gt(dd)
tbl <- fmt_number(
  tbl,
  columns = c(2, 3, 4),
  decimals = 2,
  drop_trailing_zeros = TRUE
)
tbl <- tab_options(tbl, table.font.size = "10pt")

# # Save in LaTex format
# gt::gtsave(tbl, filename = "out/Israel_descriptive_stats.tex", 
#            encoding = "UTF-8", fileEncoding = "UTF-8", use_glyphs = TRUE)

# Show the statistics as an interactive table
DT::datatable(as.data.frame(tbl))

Save the data in the file ‘data/israel_survey.RDS’.

This is the original data file after tidying and before adding any new political extremism variables.

# Persist the tidied survey data
saveRDS(object = df, file = "Israel Survey/data/israel_survey.RDS")