if (!require("haven")) install.packages("haven")
## Cargando paquete requerido: haven
if (!require("dplyr")) install.packages("dplyr")
## Cargando paquete requerido: dplyr
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if (!require("magrittr")) install.packages("magrittr")
## Cargando paquete requerido: magrittr
if (!require("ggplot2")) install.packages("ggplot2")
## Cargando paquete requerido: ggplot2
if (!require("factoextra")) install.packages("factoextra")
## Cargando paquete requerido: factoextra
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(haven)
library(dplyr)
library(magrittr)
library(ggplot2)
library(factoextra)
Analysis Overview
This analysis aims to profile participants based on their concerns about international conflicts (variables P2 and P3) and their evaluations of key political leaders (variables VALORALIDERES_1 to VALORALIDERES_4).
The necessary libraries are loaded to facilitate the analysis.
library(haven)
library(dplyr)
library(magrittr)
library(ggplot2)
library(factoextra)
The dataset is imported and reviewed to ensure proper loading.
# Import the SPSS survey file and preview its first rows to confirm it loaded.
DFWar <- read_sav(file = "C:/Users/Manuel/Desktop/3441.sav")
head(x = DFWar)
Non-response values are converted to missing values to maintain data integrity.
# Recode survey non-response codes as NA so they are not treated as answers:
# codes 98/99 for P3 and the four leader ratings, codes 8/9 for P4.
# NOTE(review): the clustering further down uses P2, which is not recoded
# here, while P4 is recoded but not used afterwards -- confirm this is intended.
DFWar <- DFWar %>%
  mutate(
    across(
      all_of(c("P3", paste0("VALORALIDERES_", 1:4))),
      ~ na_if(na_if(.x, 98), 99)
    ),
    P4 = na_if(na_if(P4, 8), 9)
  )
The transformation is verified by counting the missing values, with an expected total of 283.
# Count the NAs in each recoded column (one `NA_<column>` entry per variable)
# and add them up.
NA_counts_all <- DFWar %>%
  summarize(across(
    all_of(c("P3", "P4", paste0("VALORALIDERES_", 1:4))),
    ~ sum(is.na(.x)),
    .names = "NA_{.col}"
  ))
sum(NA_counts_all)
## [1] 283
A subset of the relevant columns is created, and missing values are removed.
# Keep only the clustering variables, then drop every row with a missing value.
SubDFWar <- DFWar %>%
  select(P2, P3, VALORALIDERES_1:VALORALIDERES_4)
SubDFWarNA <- SubDFWar %>%
  na.omit()
Hierarchical clustering is performed to identify patterns among participants.
# Rescale each column without centering, compute pairwise distances, and
# draw a complete-linkage dendrogram of the participants.
X <- SubDFWarNA %>%
  scale(center = FALSE, scale = TRUE)
dj <- dist(X)
cc <- hclust(d = dj, method = "complete")
plot(cc, main = "Participant Clustering")
The optimal number of clusters is determined using the elbow method and silhouette analysis.
# Elbow method: build the "wss" diagnostic plot for k-means on the subset
# and display it.
nclusterwar <- fviz_nbclust(
  x = SubDFWarNA,
  FUNcluster = kmeans,
  method = "wss"
)
nclusterwar
library(cluster)
# Average silhouette width of a k-means partition for k = 2..10.
# The pairwise distances do not depend on k, so they are computed once.
sil_dist <- dist(SubDFWarNA)
silhouette_values <- vapply(2:10, function(k) {
  # nstart = 25 matches the between-SS loop and reduces the dependence on a
  # single random initialisation.
  kmeans_model <- kmeans(SubDFWarNA, centers = k, nstart = 25)
  sil <- silhouette(kmeans_model$cluster, sil_dist)
  # silhouette() returns a matrix with columns cluster, neighbor and
  # sil_width; only sil_width must be averaged. Taking mean() of the whole
  # matrix would mix cluster ids into the score.
  mean(sil[, "sil_width"])
}, numeric(1))
plot(2:10, silhouette_values, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of Clusters",
     ylab = "Average Silhouette Coefficient",
     main = "Silhouette Method for Optimal Cluster Count")
# Fit k-means for k = 1..5 (25 random starts each) and report the
# between-cluster sum of squares obtained for every k.
num_clusters <- 1:5
betweenss_values <- numeric(length(num_clusters))
for (k in num_clusters) {
  kmeans_model <- kmeans(SubDFWarNA, centers = k, nstart = 25)
  betweenss_values[k] <- kmeans_model$betweenss
  cat("Number of Clusters:", k, "- Between SS:", betweenss_values[k], "\n")
}
## Number of Clusters: 1 - Between SS: 1.185981e-08
## Number of Clusters: 2 - Between SS: 51712.62
## Number of Clusters: 3 - Between SS: 69066.59
## Number of Clusters: 4 - Between SS: 74733
## Number of Clusters: 5 - Between SS: 79827.43
A final cluster analysis is conducted, and the results are summarized.
# Cut a hierarchical clustering of the subset into three groups, store the
# membership on the data, and report the mean of each profiling variable
# within every group under descriptive column names.
dist_matrix <- dist(SubDFWarNA)
hc <- hclust(dist_matrix)
cluster_membership <- cutree(hc, k = 3)
SubDFWarNA[["Cluster"]] <- cluster_membership
cluster_summary <- setNames(
  aggregate(
    . ~ cluster_membership,
    data = SubDFWarNA[, c(
      "P2", "P3",
      "VALORALIDERES_1", "VALORALIDERES_2",
      "VALORALIDERES_3", "VALORALIDERES_4"
    )],
    FUN = mean
  ),
  c(
    "Cluster", "UkraineConflict", "MiddleEastConflict",
    "PedroSanchez", "AlbertoFeijoo", "YolandaDiaz", "SantiagoAbascal"
  )
)
print(cluster_summary)
## Cluster UkraineConflict MiddleEastConflict PedroSanchez AlbertoFeijoo
## 1 1 2.193660 1.750517 7.192970 2.972433
## 2 2 2.457377 2.432787 1.352459 6.880328
## 3 3 2.397516 2.283456 2.591191 3.896669
## YolandaDiaz SantiagoAbascal
## 1 7.048932 1.343212
## 2 2.032787 6.655738
## 3 2.501976 1.944664
Cluster Summaries:
Conclusion: The analysis identifies distinct groups based on conflict concerns and political leader evaluations, offering insights into public opinion segmentation.
Analysis of Voting Determinants Based on Household Income, Voting Probability, and Ideological Self-Placement
Introduction This analysis aims to explore the underlying components influencing voting behavior by examining three key variables: household income (INGRESHOG), voting probability (PROBVOTO), and ideological self-placement (ESCIDEOL). The results obtained will help assess the appropriateness and usefulness of the derived solution.
Data Processing The dataset used in this analysis was sourced from a survey file. The following preprocessing steps were conducted:
library(haven)
# Load the survey and keep the three variables under study.
DFvote <- read_sav("path/to/datafile.sav")
DFvote <- DFvote %>% select(INGRESHOG, PROBVOTO, ESCIDEOL)
# PROBVOTO and ESCIDEOL are set to NA if they contained values 98 or 99.
# (This sentence was previously fused onto the code line, which made the
# script unparseable.)
# NOTE(review): INGRESHOG is not recoded -- confirm it has no such codes.
DFvote <- DFvote %>%
  mutate(
    PROBVOTO = ifelse(PROBVOTO %in% c(98, 99), NA, PROBVOTO),
    ESCIDEOL = ifelse(ESCIDEOL %in% c(98, 99), NA, ESCIDEOL)
  )
# Drop incomplete rows and make every column plain numeric.
DFvote <- na.omit(DFvote)
DFvote <- DFvote %>%
  mutate(
    INGRESHOG = as.numeric(INGRESHOG),
    PROBVOTO = as.numeric(PROBVOTO),
    ESCIDEOL = as.numeric(ESCIDEOL)
  )
# Standardise each variable. scale() returns a one-column matrix, so the
# result is converted back to a plain numeric vector to keep ordinary
# data frame columns.
DFvote_scaled <- DFvote %>%
  mutate(
    INGRESHOG = as.numeric(scale(INGRESHOG)),
    PROBVOTO = as.numeric(scale(PROBVOTO)),
    ESCIDEOL = as.numeric(scale(ESCIDEOL))
  )
Exploratory Analysis An initial frequency analysis of the variables revealed a high probability of certain individuals voting, which could be an important factor in subsequent analysis.
# Print a frequency table for each of the three study variables.
variables <- c("INGRESHOG", "PROBVOTO", "ESCIDEOL")
for (variable in variables) {
  freq_table <- table(DFvote[[variable]])
  cat("Frequency of repeated values for", variable, ":\n")
  print(freq_table)
  cat("\n")
}
Correlation Analysis A correlation heatmap was generated to examine the relationships between variables. The results indicated minimal correlations, which suggests a simple data structure and calls into question the suitability of factor analysis.
library(ggplot2)
library(reshape2)
# Correlation matrix of the standardised variables, reshaped to long format
# (one row per pair of variables) for plotting.
matcor <- cor(DFvote_scaled)
matriz_cor_melt <- setNames(
  melt(matcor),
  c("variable_x", "variable_y", "correlation")
)
# Heatmap with the rounded coefficient printed in every tile; the colour
# scale diverges around zero.
ggplot(
  data = matriz_cor_melt,
  mapping = aes(x = variable_x, y = variable_y, fill = correlation)
) +
  geom_tile() +
  geom_text(aes(label = round(correlation, 2)), color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1)) +
  coord_fixed()
Factor Analysis Given these results, a factor analysis was conducted with caution. The number of factors was determined using the eigenvalue criterion, which suggested that two factors should be retained.
library(psych)
# Eigenvalue (Kaiser) criterion: count the eigenvalues of the correlation
# matrix that exceed 1. The PCA is run on the data with `scale. = TRUE`, so
# sdev^2 are the eigenvalues of the correlation matrix. Passing `matcor`
# itself to prcomp() would treat the rows of the correlation matrix as
# observations and yield unrelated values.
pca_result <- prcomp(DFvote_scaled, scale. = TRUE)
eigenvalues <- pca_result$sdev^2
factores_significativos <- sum(eigenvalues > 1)
# NOTE(review): the number of factors is fixed at 2 below rather than taken
# from `factores_significativos` -- confirm the two agree.
modelovote <- fa(DFvote_scaled, nfactors = 2, rotate = "varimax", fm = "minres")
# cut = 0 prints every loading, however small.
print(modelovote$loadings, cut = 0)
Conclusion The factorial analysis provides valuable insights into the determinants of voting behavior:
Final Remarks This analysis underscores the importance of understanding both socioeconomic and ideological dimensions in electoral behavior. Although the data quality poses some limitations, the extracted factors offer meaningful insights for policymakers and political strategists.
Modeling the variable P1 using age, gender, and ideological self-placement.
Analysis of Voting Determinants Based on Household Income, Voting Probability, and Ideological Self-Placement
Introduction This analysis aims to explore the underlying components influencing voting behavior by examining three key variables: household income (INGRESHOG), voting probability (PROBVOTO), and ideological self-placement (ESCIDEOL). The results obtained will help assess the appropriateness and usefulness of the derived solution.
Data Processing The dataset used in this analysis was sourced from a survey file. The following preprocessing steps were conducted:
library(haven)
# Load the survey and keep the three variables under study.
DFvote <- read_sav("path/to/datafile.sav")
DFvote <- DFvote %>% select(INGRESHOG, PROBVOTO, ESCIDEOL)
# PROBVOTO and ESCIDEOL are set to NA if they contained values 98 or 99.
# (This sentence was previously fused onto the code line, which made the
# script unparseable.)
# NOTE(review): INGRESHOG is not recoded -- confirm it has no such codes.
DFvote <- DFvote %>%
  mutate(
    PROBVOTO = ifelse(PROBVOTO %in% c(98, 99), NA, PROBVOTO),
    ESCIDEOL = ifelse(ESCIDEOL %in% c(98, 99), NA, ESCIDEOL)
  )
# Drop incomplete rows and make every column plain numeric.
DFvote <- na.omit(DFvote)
DFvote <- DFvote %>%
  mutate(
    INGRESHOG = as.numeric(INGRESHOG),
    PROBVOTO = as.numeric(PROBVOTO),
    ESCIDEOL = as.numeric(ESCIDEOL)
  )
# Standardise each variable. scale() returns a one-column matrix, so the
# result is converted back to a plain numeric vector to keep ordinary
# data frame columns.
DFvote_scaled <- DFvote %>%
  mutate(
    INGRESHOG = as.numeric(scale(INGRESHOG)),
    PROBVOTO = as.numeric(scale(PROBVOTO)),
    ESCIDEOL = as.numeric(scale(ESCIDEOL))
  )
Exploratory Analysis An initial frequency analysis of the variables revealed a high probability of certain individuals voting, which could be an important factor in subsequent analysis.
# Print a frequency table for each of the three study variables.
variables <- c("INGRESHOG", "PROBVOTO", "ESCIDEOL")
for (variable in variables) {
  freq_table <- table(DFvote[[variable]])
  cat("Frequency of repeated values for", variable, ":\n")
  print(freq_table)
  cat("\n")
}
Correlation Analysis A correlation heatmap was generated to examine the relationships between variables. The results indicated minimal correlations, which suggests a simple data structure and calls into question the suitability of factor analysis.
library(ggplot2)
library(reshape2)
# Correlation matrix of the standardised variables, reshaped to long format
# (one row per pair of variables) for plotting.
matcor <- cor(DFvote_scaled)
matriz_cor_melt <- setNames(
  melt(matcor),
  c("variable_x", "variable_y", "correlation")
)
# Heatmap with the rounded coefficient printed in every tile; the colour
# scale diverges around zero.
ggplot(
  data = matriz_cor_melt,
  mapping = aes(x = variable_x, y = variable_y, fill = correlation)
) +
  geom_tile() +
  geom_text(aes(label = round(correlation, 2)), color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1)) +
  coord_fixed()
Factor Analysis Given these results, a factor analysis was conducted with caution. The number of factors was determined using the eigenvalue criterion, which suggested that two factors should be retained.
library(psych)
# Eigenvalue (Kaiser) criterion: count the eigenvalues of the correlation
# matrix that exceed 1. The PCA is run on the data with `scale. = TRUE`, so
# sdev^2 are the eigenvalues of the correlation matrix. Passing `matcor`
# itself to prcomp() would treat the rows of the correlation matrix as
# observations and yield unrelated values.
pca_result <- prcomp(DFvote_scaled, scale. = TRUE)
eigenvalues <- pca_result$sdev^2
factores_significativos <- sum(eigenvalues > 1)
# NOTE(review): the number of factors is fixed at 2 below rather than taken
# from `factores_significativos` -- confirm the two agree.
modelovote <- fa(DFvote_scaled, nfactors = 2, rotate = "varimax", fm = "minres")
# cut = 0 prints every loading, however small.
print(modelovote$loadings, cut = 0)
Conclusion The factorial analysis provides valuable insights into the determinants of voting behavior:
Logistic Regression Model Analysis
Data Preparation and Model Fitting
# Load the survey, keep the outcome and its predictors, blank out the 98/99
# codes of ESCIDEOL, drop incomplete rows, and recode P1 so that 2 becomes 0.
# NOTE(review): any P1 value other than 1 or 2 is left unchanged -- confirm
# none remain, since a binomial glm expects a 0/1 response.
subdfsanchez <- read_sav("path/to/datafile.sav") %>%
  select(P1, EDAD, SEXO, ESCIDEOL) %>%
  mutate(ESCIDEOL = replace(ESCIDEOL, ESCIDEOL %in% c(98, 99), NA)) %>%
  na.omit() %>%
  mutate(P1 = ifelse(P1 == 2, 0, P1))
# Logistic regression of P1 on age, sex and ideological self-placement.
modeloglmsanchez <- glm(
  P1 ~ EDAD + SEXO + ESCIDEOL,
  data = subdfsanchez,
  family = "binomial"
)
summary(modeloglmsanchez)
Evaluation of Model Fit
# Hosmer-Lemeshow goodness-of-fit test with 5 groups.
# hoslem.test() is provided by the ResourceSelection package, which this
# script never attaches, so the call is namespaced explicitly. The observed
# response is taken from the fitted model so that it always lines up with
# the fitted probabilities.
hoslem_test <- ResourceSelection::hoslem.test(
  modeloglmsanchez$y,
  fitted(modeloglmsanchez),
  g = 5
)
hoslem_test
Predictor Importance
Final Remarks This analysis underscores the importance of understanding both socioeconomic and ideological dimensions in electoral behavior. Although the data quality poses some limitations, the extracted factors and regression models offer meaningful insights for policymakers and political strategists.
Analysis of Voting Determinants Based on Household Income, Voting Probability, and Ideological Self-Placement
Introduction This analysis aims to explore the underlying components influencing voting behavior by examining three key variables: household income (INGRESHOG), voting probability (PROBVOTO), and ideological self-placement (ESCIDEOL). The results obtained will help assess the appropriateness and usefulness of the derived solution.
Data Processing The dataset used in this analysis was sourced from a survey file. The following preprocessing steps were conducted:
library(haven)
# Load the survey and keep the three variables under study.
DFvote <- read_sav("path/to/datafile.sav")
DFvote <- DFvote %>% select(INGRESHOG, PROBVOTO, ESCIDEOL)
# PROBVOTO and ESCIDEOL are set to NA if they contained values 98 or 99.
# (This sentence was previously fused onto the code line, which made the
# script unparseable.)
# NOTE(review): INGRESHOG is not recoded -- confirm it has no such codes.
DFvote <- DFvote %>%
  mutate(
    PROBVOTO = ifelse(PROBVOTO %in% c(98, 99), NA, PROBVOTO),
    ESCIDEOL = ifelse(ESCIDEOL %in% c(98, 99), NA, ESCIDEOL)
  )
# Drop incomplete rows and make every column plain numeric.
DFvote <- na.omit(DFvote)
DFvote <- DFvote %>%
  mutate(
    INGRESHOG = as.numeric(INGRESHOG),
    PROBVOTO = as.numeric(PROBVOTO),
    ESCIDEOL = as.numeric(ESCIDEOL)
  )
# Standardise each variable. scale() returns a one-column matrix, so the
# result is converted back to a plain numeric vector to keep ordinary
# data frame columns.
DFvote_scaled <- DFvote %>%
  mutate(
    INGRESHOG = as.numeric(scale(INGRESHOG)),
    PROBVOTO = as.numeric(scale(PROBVOTO)),
    ESCIDEOL = as.numeric(scale(ESCIDEOL))
  )
Exploratory Analysis An initial frequency analysis of the variables revealed a high probability of certain individuals voting, which could be an important factor in subsequent analysis.
# Print a frequency table for each of the three study variables.
variables <- c("INGRESHOG", "PROBVOTO", "ESCIDEOL")
for (variable in variables) {
  freq_table <- table(DFvote[[variable]])
  cat("Frequency of repeated values for", variable, ":\n")
  print(freq_table)
  cat("\n")
}
Regression Model to Explain Survey Duration
Data Preparation and Model Fitting
# Load the survey and keep the outcome (IA_E3) together with its predictors.
DFTIME <- read_sav("path/to/datafile.sav")
# Blank out the 98/99 codes of ESCIDEOL, drop incomplete rows, recode SEXO
# (2 -> 0, otherwise 1) and TIPO_TEL (2 -> 1, otherwise 0), and turn all
# four predictors into factors.
# NOTE(review): EDAD and ESCIDEOL become factors, so the models below
# estimate one coefficient per distinct value -- confirm this is intended.
subftime <- DFTIME %>%
  select(EDAD, SEXO, ESCIDEOL, TIPO_TEL, IA_E3) %>%
  mutate(ESCIDEOL = replace(ESCIDEOL, ESCIDEOL %in% c(98, 99), NA)) %>%
  na.omit() %>%
  mutate(
    EDAD = factor(EDAD),
    SEXO = factor(ifelse(SEXO == 2, 0, 1)),
    ESCIDEOL = factor(ESCIDEOL),
    TIPO_TEL = factor(ifelse(TIPO_TEL == 2, 1, 0))
  )
ANOVA Analysis
# Analysis of variance of IA_E3 on the four factor predictors.
modeloanova <- aov(
  formula = IA_E3 ~ EDAD + SEXO + ESCIDEOL + TIPO_TEL,
  data = subftime
)
summary(modeloanova)
The variable EDAD is the only one that shows a significant effect on the duration of the survey, while the other variables (SEXO, ESCIDEOL, TIPO_TEL) do not have a significant effect according to the ANOVA analysis. This indicates that the duration of the survey is primarily influenced by the respondents’ age.
Linear Regression Analysis
# Linear model of IA_E3 on the same four predictors.
modelo <- lm(
  formula = IA_E3 ~ EDAD + SEXO + ESCIDEOL + TIPO_TEL,
  data = subftime
)
summary(modelo)
The results suggest that age, ideological self-placement, and telephone type are significant factors influencing survey duration, whereas gender appears to have minimal and non-significant influence. Removing gender, we obtain a refined model:
# Refined linear model: the same specification without SEXO.
modelo2 <- lm(
  formula = IA_E3 ~ EDAD + ESCIDEOL + TIPO_TEL,
  data = subftime
)
summary(modelo2)
Conclusion
Model Utility