Loading the required libraries

if (!require("haven")) install.packages("haven")
## Cargando paquete requerido: haven
if (!require("dplyr")) install.packages("dplyr")
## Cargando paquete requerido: dplyr
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
if (!require("magrittr")) install.packages("magrittr")
## Cargando paquete requerido: magrittr
if (!require("ggplot2")) install.packages("ggplot2")
## Cargando paquete requerido: ggplot2
if (!require("factoextra")) install.packages("factoextra")
## Cargando paquete requerido: factoextra
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(haven)
library(dplyr)
library(magrittr)
library(ggplot2)
library(factoextra)

Analysis Overview

This analysis aims to profile participants based on their concerns about international conflicts (variables P2 and P3) and their evaluations of key political leaders (variables VALORALIDERES_1 to VALORALIDERES_4).

The necessary libraries are loaded to facilitate the analysis.

library(haven)
library(dplyr)
library(magrittr)
library(ggplot2)
library(factoextra)

The dataset is imported and reviewed to ensure proper loading.

# Read the survey file into DFWar and show its first rows as a quick check
# that the import worked.
DFWar <- haven::read_sav(file = "C:/Users/Manuel/Desktop/3441.sav")
head(x = DFWar)

Non-response values are converted to missing values to maintain data integrity.

# Turn non-response codes into NA: 98 and 99 for P3 and the four leader
# ratings, 8 and 9 for P4. Each column is recoded in place.
# NOTE(review): the clustering subset built afterwards uses P2 and P3, while
# this step recodes P4 rather than P2 — confirm which variables were intended.
DFWar <- DFWar %>%
  mutate(
    across(
      c(P3, VALORALIDERES_1, VALORALIDERES_2, VALORALIDERES_3, VALORALIDERES_4),
      ~ na_if(na_if(.x, 98), 99)
    ),
    P4 = na_if(na_if(P4, 8), 9)
  )

The transformation is verified by counting the missing values, with an expected total of 283.

# One-row summary holding the number of NA values in each recoded column
# (columns are named NA_<variable>), followed by the grand total.
NA_counts_all <- DFWar %>%
  summarize(
    across(
      c(
        P3, P4,
        VALORALIDERES_1, VALORALIDERES_2, VALORALIDERES_3, VALORALIDERES_4
      ),
      ~ sum(is.na(.x)),
      .names = "NA_{.col}"
    )
  )
sum(NA_counts_all)
## [1] 283

A subset of the relevant columns is created, and missing values are removed.

# Keep only the variables used for clustering, then discard every row that
# has a missing value in any of them.
SubDFWar <- DFWar %>%
  select(P2, P3, VALORALIDERES_1:VALORALIDERES_4)
SubDFWarNA <- SubDFWar %>%
  na.omit()

Hierarchical clustering is performed to identify patterns among participants.

# Rescale the columns without centring them, compute pairwise distances
# between respondents, and draw the complete-linkage dendrogram.
X <- scale(SubDFWarNA, scale = TRUE, center = FALSE)
dj <- dist(x = X)
cc <- hclust(d = dj, method = "complete")
plot(x = cc, main = "Participant Clustering")

The optimal number of clusters is determined using the elbow method and silhouette analysis.

# Elbow plot: within-cluster sum of squares of k-means solutions on the
# clustering variables; the plot object is stored and then displayed.
nclusterwar <- fviz_nbclust(
  x = SubDFWarNA,
  FUNcluster = kmeans,
  method = "wss"
)
print(nclusterwar)

library(cluster)

# Average silhouette width of k-means solutions for k = 2..10.
#
# silhouette() returns a matrix with the columns "cluster", "neighbor" and
# "sil_width". Only "sil_width" may be averaged: taking mean() of the whole
# matrix (as was done before) mixes cluster ids and neighbor ids into the
# score and makes it grow with k regardless of cluster quality.
candidate_k <- 2:10

# The distance matrix does not depend on k, so it is computed once.
war_dist <- dist(SubDFWarNA)

silhouette_values <- vapply(
  candidate_k,
  function(k) {
    # nstart = 25 matches the between-SS k-means loop in this script and
    # makes the result less dependent on a single random initialisation.
    kmeans_model <- kmeans(SubDFWarNA, centers = k, nstart = 25)
    sil <- silhouette(kmeans_model$cluster, war_dist)
    mean(sil[, "sil_width"])
  },
  numeric(1)
)

plot(candidate_k, silhouette_values, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of Clusters",
     ylab = "Average Silhouette Coefficient",
     main = "Silhouette Method for Optimal Cluster Count")

# Between-cluster sum of squares of k-means solutions for k = 1..5.
# k is both the number of clusters and the position in betweenss_values.
num_clusters <- 1:5
betweenss_values <- numeric(length(num_clusters))

for (k in num_clusters) {
  kmeans_model <- kmeans(SubDFWarNA, centers = k, nstart = 25)
  betweenss_values[k] <- kmeans_model[["betweenss"]]
}

# Report one line per candidate k.
for (k in num_clusters) {
  cat("Number of Clusters:", k, "- Between SS:", betweenss_values[k], "\n")
}
## Number of Clusters: 1 - Between SS: 1.185981e-08 
## Number of Clusters: 2 - Between SS: 51712.62 
## Number of Clusters: 3 - Between SS: 69066.59 
## Number of Clusters: 4 - Between SS: 74733 
## Number of Clusters: 5 - Between SS: 79827.43

A final cluster analysis is conducted, and the results are summarized.

# Final solution: hierarchical clustering on the distances between
# respondents, cut into three groups. The group label is stored as a new
# Cluster column, and the mean of every clustering variable is reported per
# group under readable column names.
profile_vars <- c(
  "P2", "P3",
  "VALORALIDERES_1", "VALORALIDERES_2", "VALORALIDERES_3", "VALORALIDERES_4"
)
profile_labels <- c(
  "Cluster", "UkraineConflict", "MiddleEastConflict",
  "PedroSanchez", "AlbertoFeijoo", "YolandaDiaz", "SantiagoAbascal"
)

dist_matrix <- dist(SubDFWarNA)
hc <- hclust(dist_matrix)
cluster_membership <- cutree(hc, k = 3)
SubDFWarNA$Cluster <- cluster_membership

# cluster_membership is not a column of the subset passed as `data`; the
# formula picks it up from the calling environment.
cluster_summary <- aggregate(
  . ~ cluster_membership,
  data = SubDFWarNA[, profile_vars],
  FUN = mean
)
colnames(cluster_summary) <- profile_labels
print(cluster_summary)
##   Cluster UkraineConflict MiddleEastConflict PedroSanchez AlbertoFeijoo
## 1       1        2.193660           1.750517     7.192970      2.972433
## 2       2        2.457377           2.432787     1.352459      6.880328
## 3       3        2.397516           2.283456     2.591191      3.896669
##   YolandaDiaz SantiagoAbascal
## 1    7.048932        1.343212
## 2    2.032787        6.655738
## 3    2.501976        1.944664

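Cluster Summaries:

  • Cluster 1: highest mean ratings for Pedro Sánchez (7.19) and Yolanda Díaz (7.05), and the lowest for Santiago Abascal (1.34).
  • Cluster 2: highest mean ratings for Alberto Feijóo (6.88) and Santiago Abascal (6.66), and the lowest for Pedro Sánchez (1.35).
  • Cluster 3: low mean ratings for all four leaders (between 1.94 and 3.90).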

Conclusion: The analysis identifies distinct groups based on conflict concerns and political leader evaluations, offering insights into public opinion segmentation.

Analysis of Voting Determinants Based on Household Income, Voting Probability, and Ideological Self-Placement

Introduction This analysis aims to explore the underlying components influencing voting behavior by examining three key variables: household income (INGRESHOG), voting probability (PROBVOTO), and ideological self-placement (ESCIDEOL). The results obtained will help assess the appropriateness and usefulness of the derived solution.

Data Processing The dataset used in this analysis was sourced from a survey file. The following preprocessing steps were conducted:

  1. Data Selection:
    • Only the relevant variables (INGRESHOG, PROBVOTO, ESCIDEOL) were retained for analysis.
library(haven)
# Load the survey file and keep just the three variables analysed here.
DFvote <- read_sav("path/to/datafile.sav") %>%
  select(INGRESHOG, PROBVOTO, ESCIDEOL)
  2. Handling Missing Values:
    • Non-response codes (98 and 99) in PROBVOTO and ESCIDEOL were recoded as NA.
    • Rows with missing values were subsequently omitted.
# Recode the values 98 and 99 of PROBVOTO and ESCIDEOL as NA, then drop every
# row that contains a missing value.
# NOTE(review): INGRESHOG is not recoded here — confirm whether it also
# carries non-response codes.
DFvote <- DFvote %>%
  mutate(
    across(c(PROBVOTO, ESCIDEOL), ~ ifelse(.x %in% c(98, 99), NA, .x))
  ) %>%
  na.omit()
  3. Data Scaling and Transformation:
    • All variables were converted to numeric format to prevent type conflicts.
    • Standardization was performed to ensure comparability.
# Convert the three analysis variables to plain numeric vectors.
DFvote <- DFvote %>%
  mutate(across(c(INGRESHOG, PROBVOTO, ESCIDEOL), as.numeric))

# Standardise each variable (mean 0, sd 1).
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; storing that directly would create matrix
# columns inside the data frame. as.numeric() flattens the result so that
# DFvote_scaled holds ordinary numeric columns.
DFvote_scaled <- DFvote %>%
  mutate(
    across(c(INGRESHOG, PROBVOTO, ESCIDEOL), ~ as.numeric(scale(.x)))
  )

Exploratory Analysis An initial frequency analysis of the variables revealed a high probability of certain individuals voting, which could be an important factor in subsequent analysis.

# Print a frequency table for each analysis variable, preceded by a header
# line and followed by a blank line.
variables <- c("INGRESHOG", "PROBVOTO", "ESCIDEOL")
for (variable in variables) {
  value_counts <- table(DFvote[[variable]])
  cat("Frequency of repeated values for", variable, ":\n")
  print(value_counts)
  cat("\n")
}

Correlation Analysis: A correlation heatmap was generated to examine the relationships between the variables. The correlations were minimal, which suggests that the variables share little common structure and calls into question the suitability of factor analysis.

library(ggplot2)
library(reshape2)

# Correlation matrix of the standardised variables, reshaped to long format
# (one row per pair of variables) for plotting.
matcor <- cor(DFvote_scaled)
matriz_cor_melt <- setNames(
  melt(matcor),
  c("variable_x", "variable_y", "correlation")
)

# Heatmap: tile colour encodes the correlation (blue = negative, white = 0,
# red = positive) and each tile is labelled with the rounded value.
ggplot(
  data = matriz_cor_melt,
  mapping = aes(x = variable_x, y = variable_y, fill = correlation)
) +
  geom_tile() +
  geom_text(mapping = aes(label = round(correlation, 2)), color = "black") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white", midpoint = 0
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1)) +
  coord_fixed()

Factor Analysis: Given these results, a factor analysis was conducted with caution. The number of factors was determined using the eigenvalue (Kaiser) criterion, which suggested that two factors should be retained.

library(psych)

# Eigenvalues for the eigenvalue-greater-than-one criterion.
# The PCA must be run on the observations. Passing the 3x3 correlation matrix
# to prcomp() (as was done before) treats its three rows as three data points
# and yields eigenvalues unrelated to the data's correlation structure.
# `scale.` is spelled out in full instead of relying on partial matching.
pca_result <- prcomp(DFvote_scaled, center = TRUE, scale. = TRUE)
eigenvalues <- pca_result$sdev^2
factores_significativos <- sum(eigenvalues > 1)

# NOTE(review): nfactors stays fixed at 2, as in the written report; compare
# it with factores_significativos and revisit the choice if they disagree.
modelovote <- fa(
  DFvote_scaled,
  nfactors = 2,
  rotate = "varimax",
  fm = "minres"
)
print(modelovote$loadings, cut = 0)

Conclusion The factorial analysis provides valuable insights into the determinants of voting behavior:

  1. Factor 1: Socioeconomic and Electoral Participation Dimension
    • Significant relationship between household income and voting probability.
    • Suggests lower-income individuals may be more inclined to participate in voting to influence policies affecting their economic situation.
  2. Factor 2: Ideological Dimension
    • Highlights the influence of ideological self-placement on electoral decisions, albeit with a lower explanatory power.

Final Remarks This analysis underscores the importance of understanding both socioeconomic and ideological dimensions in electoral behavior. Although the data quality poses some limitations, the extracted factors offer meaningful insights for policymakers and political strategists.

3

Modeling the variable P1 using age, gender, and ideological self-placement.

Analysis of Voting Determinants Based on Household Income, Voting Probability, and Ideological Self-Placement

Introduction This analysis aims to explore the underlying components influencing voting behavior by examining three key variables: household income (INGRESHOG), voting probability (PROBVOTO), and ideological self-placement (ESCIDEOL). The results obtained will help assess the appropriateness and usefulness of the derived solution.

Data Processing The dataset used in this analysis was sourced from a survey file. The following preprocessing steps were conducted:

  1. Data Selection:
    • Only the relevant variables (INGRESHOG, PROBVOTO, ESCIDEOL) were retained for analysis.
library(haven)
# Load the survey file and keep just the three variables analysed here.
DFvote <- read_sav("path/to/datafile.sav") %>%
  select(INGRESHOG, PROBVOTO, ESCIDEOL)
  2. Handling Missing Values:
    • Non-response codes (98 and 99) in PROBVOTO and ESCIDEOL were recoded as NA.
    • Rows with missing values were subsequently omitted.
# Recode the values 98 and 99 of PROBVOTO and ESCIDEOL as NA, then drop every
# row that contains a missing value.
# NOTE(review): INGRESHOG is not recoded here — confirm whether it also
# carries non-response codes.
DFvote <- DFvote %>%
  mutate(
    across(c(PROBVOTO, ESCIDEOL), ~ ifelse(.x %in% c(98, 99), NA, .x))
  ) %>%
  na.omit()
  3. Data Scaling and Transformation:
    • All variables were converted to numeric format to prevent type conflicts.
    • Standardization was performed to ensure comparability.
# Convert the three analysis variables to plain numeric vectors.
DFvote <- DFvote %>%
  mutate(across(c(INGRESHOG, PROBVOTO, ESCIDEOL), as.numeric))

# Standardise each variable (mean 0, sd 1).
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; storing that directly would create matrix
# columns inside the data frame. as.numeric() flattens the result so that
# DFvote_scaled holds ordinary numeric columns.
DFvote_scaled <- DFvote %>%
  mutate(
    across(c(INGRESHOG, PROBVOTO, ESCIDEOL), ~ as.numeric(scale(.x)))
  )

Exploratory Analysis An initial frequency analysis of the variables revealed a high probability of certain individuals voting, which could be an important factor in subsequent analysis.

# Print a frequency table for each analysis variable, preceded by a header
# line and followed by a blank line.
variables <- c("INGRESHOG", "PROBVOTO", "ESCIDEOL")
for (variable in variables) {
  value_counts <- table(DFvote[[variable]])
  cat("Frequency of repeated values for", variable, ":\n")
  print(value_counts)
  cat("\n")
}

Correlation Analysis: A correlation heatmap was generated to examine the relationships between the variables. The correlations were minimal, which suggests that the variables share little common structure and calls into question the suitability of factor analysis.

library(ggplot2)
library(reshape2)

# Correlation matrix of the standardised variables, reshaped to long format
# (one row per pair of variables) for plotting.
matcor <- cor(DFvote_scaled)
matriz_cor_melt <- setNames(
  melt(matcor),
  c("variable_x", "variable_y", "correlation")
)

# Heatmap: tile colour encodes the correlation (blue = negative, white = 0,
# red = positive) and each tile is labelled with the rounded value.
ggplot(
  data = matriz_cor_melt,
  mapping = aes(x = variable_x, y = variable_y, fill = correlation)
) +
  geom_tile() +
  geom_text(mapping = aes(label = round(correlation, 2)), color = "black") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white", midpoint = 0
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1)) +
  coord_fixed()

Factor Analysis: Given these results, a factor analysis was conducted with caution. The number of factors was determined using the eigenvalue (Kaiser) criterion, which suggested that two factors should be retained.

library(psych)

# Eigenvalues for the eigenvalue-greater-than-one criterion.
# The PCA must be run on the observations. Passing the 3x3 correlation matrix
# to prcomp() (as was done before) treats its three rows as three data points
# and yields eigenvalues unrelated to the data's correlation structure.
# `scale.` is spelled out in full instead of relying on partial matching.
pca_result <- prcomp(DFvote_scaled, center = TRUE, scale. = TRUE)
eigenvalues <- pca_result$sdev^2
factores_significativos <- sum(eigenvalues > 1)

# NOTE(review): nfactors stays fixed at 2, as in the written report; compare
# it with factores_significativos and revisit the choice if they disagree.
modelovote <- fa(
  DFvote_scaled,
  nfactors = 2,
  rotate = "varimax",
  fm = "minres"
)
print(modelovote$loadings, cut = 0)

Conclusion The factorial analysis provides valuable insights into the determinants of voting behavior:

  1. Factor 1: Socioeconomic and Electoral Participation Dimension
    • Significant relationship between household income and voting probability.
    • Suggests lower-income individuals may be more inclined to participate in voting to influence policies affecting their economic situation.
  2. Factor 2: Ideological Dimension
    • Highlights the influence of ideological self-placement on electoral decisions, albeit with a lower explanatory power.

Logistic Regression Model Analysis

Data Preparation and Model Fitting

# Build the modelling data: keep P1 and the three predictors, recode the
# ESCIDEOL values 98 and 99 as NA, and drop rows with any missing value.
subdfsanchez <- select(
  read_sav("path/to/datafile.sav"),
  P1, EDAD, SEXO, ESCIDEOL
)
subdfsanchez$ESCIDEOL[subdfsanchez$ESCIDEOL %in% c(98, 99)] <- NA
subdfsanchez <- na.omit(subdfsanchez)

# Recode the outcome so that the value 2 becomes 0; every other value is kept.
# NOTE(review): any P1 code other than 1 and 2 is left unchanged — confirm
# that none remain, since a binomial model needs a 0/1 response.
subdfsanchez[["P1"]] <- with(subdfsanchez, ifelse(P1 == 2, 0, P1))

# Logistic regression of P1 on age, sex and ideological self-placement.
modeloglmsanchez <- glm(
  formula = P1 ~ EDAD + SEXO + ESCIDEOL,
  data = subdfsanchez,
  family = "binomial"
)
summary(modeloglmsanchez)

Evaluation of Model Fit

# Hosmer-Lemeshow goodness-of-fit test comparing the observed outcome with
# the model's fitted probabilities, using g = 5 groups.
# hoslem.test() comes from the ResourceSelection package, which is not
# attached anywhere in the visible script; the call is namespace-qualified so
# it does not fail with "could not find function".
hoslem_test <- ResourceSelection::hoslem.test(
  subdfsanchez$P1,
  fitted(modeloglmsanchez),
  g = 5
)
hoslem_test

Predictor Importance

Final Remarks This analysis underscores the importance of understanding both socioeconomic and ideological dimensions in electoral behavior. Although the data quality poses some limitations, the extracted factors and regression models offer meaningful insights for policymakers and political strategists.

4

Analysis of Voting Determinants Based on Household Income, Voting Probability, and Ideological Self-Placement

Introduction This analysis aims to explore the underlying components influencing voting behavior by examining three key variables: household income (INGRESHOG), voting probability (PROBVOTO), and ideological self-placement (ESCIDEOL). The results obtained will help assess the appropriateness and usefulness of the derived solution.

Data Processing The dataset used in this analysis was sourced from a survey file. The following preprocessing steps were conducted:

  1. Data Selection:
    • Only the relevant variables (INGRESHOG, PROBVOTO, ESCIDEOL) were retained for analysis.
library(haven)
# Load the survey file and keep just the three variables analysed here.
DFvote <- read_sav("path/to/datafile.sav") %>%
  select(INGRESHOG, PROBVOTO, ESCIDEOL)
  2. Handling Missing Values:
    • Non-response codes (98 and 99) in PROBVOTO and ESCIDEOL were recoded as NA.
    • Rows with missing values were subsequently omitted.
# Recode the values 98 and 99 of PROBVOTO and ESCIDEOL as NA, then drop every
# row that contains a missing value.
# NOTE(review): INGRESHOG is not recoded here — confirm whether it also
# carries non-response codes.
DFvote <- DFvote %>%
  mutate(
    across(c(PROBVOTO, ESCIDEOL), ~ ifelse(.x %in% c(98, 99), NA, .x))
  ) %>%
  na.omit()
  3. Data Scaling and Transformation:
    • All variables were converted to numeric format to prevent type conflicts.
    • Standardization was performed to ensure comparability.
# Convert the three analysis variables to plain numeric vectors.
DFvote <- DFvote %>%
  mutate(across(c(INGRESHOG, PROBVOTO, ESCIDEOL), as.numeric))

# Standardise each variable (mean 0, sd 1).
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; storing that directly would create matrix
# columns inside the data frame. as.numeric() flattens the result so that
# DFvote_scaled holds ordinary numeric columns.
DFvote_scaled <- DFvote %>%
  mutate(
    across(c(INGRESHOG, PROBVOTO, ESCIDEOL), ~ as.numeric(scale(.x)))
  )

Exploratory Analysis An initial frequency analysis of the variables revealed a high probability of certain individuals voting, which could be an important factor in subsequent analysis.

# Print a frequency table for each analysis variable, preceded by a header
# line and followed by a blank line.
variables <- c("INGRESHOG", "PROBVOTO", "ESCIDEOL")
for (variable in variables) {
  value_counts <- table(DFvote[[variable]])
  cat("Frequency of repeated values for", variable, ":\n")
  print(value_counts)
  cat("\n")
}

Regression Model to Explain Survey Duration

Data Preparation and Model Fitting

# Load the survey and keep the outcome (IA_E3) together with its predictors.
DFTIME <- read_sav("path/to/datafile.sav")
subftime <- select(DFTIME, EDAD, SEXO, ESCIDEOL, TIPO_TEL, IA_E3)

# Recode the ESCIDEOL values 98 and 99 as NA, then drop incomplete rows.
subftime[["ESCIDEOL"]][subftime[["ESCIDEOL"]] %in% c(98, 99)] <- NA
subftime <- na.omit(subftime)

# Dummy-code and convert to factor in one step per column:
#   SEXO     -> 0 when the original value is 2, otherwise 1
#   TIPO_TEL -> 1 when the original value is 2, otherwise 0
subftime[["SEXO"]] <- factor(ifelse(subftime[["SEXO"]] == 2, 0, 1))
subftime[["TIPO_TEL"]] <- factor(ifelse(subftime[["TIPO_TEL"]] == 2, 1, 0))

# NOTE(review): EDAD and ESCIDEOL become factors, so every distinct value is
# its own level in the models fitted afterwards — confirm this is intended.
subftime[["EDAD"]] <- factor(subftime[["EDAD"]])
subftime[["ESCIDEOL"]] <- factor(subftime[["ESCIDEOL"]])

ANOVA Analysis

# ANOVA of IA_E3 on the four predictors; the ANOVA table is displayed.
modeloanova <- aov(
  formula = IA_E3 ~ EDAD + SEXO + ESCIDEOL + TIPO_TEL,
  data = subftime
)
summary(object = modeloanova)

The variable EDAD is the only one that shows a significant effect on the duration of the survey, while the other variables (SEXO, ESCIDEOL, TIPO_TEL) do not have a significant effect according to the ANOVA analysis. This indicates that the duration of the survey is primarily influenced by the respondents’ age.

Linear Regression Analysis

# Linear model of IA_E3 on all four predictors; the coefficient summary is
# displayed.
modelo <- lm(
  formula = IA_E3 ~ EDAD + SEXO + ESCIDEOL + TIPO_TEL,
  data = subftime
)
summary(object = modelo)

The results suggest that age, ideological self-placement, and telephone type are significant factors influencing survey duration, whereas gender appears to have minimal and non-significant influence. Removing gender, we obtain a refined model:

# Refined linear model: same outcome, with SEXO left out of the predictors.
modelo2 <- lm(
  formula = IA_E3 ~ EDAD + ESCIDEOL + TIPO_TEL,
  data = subftime
)
summary(object = modelo2)

Conclusion

Model Utility

  1. Resource Optimization: Helps identify potential over- or under-utilization of resources such as personnel and materials.
  2. Training and Development: Insights from the model can inform personalized training programs for interviewers.
  3. Quality Evaluation: Comparing predicted vs. actual survey durations can provide insights into data quality.
  4. Strategic Planning: Facilitates planning by predicting expected survey completion times under different conditions.
  5. Trend Identification: Analyzing variations in survey duration can reveal patterns that aid in operational efficiency.
  6. Performance Comparison: Enables comparison of interviewer performance across regions to establish best practices.