Ciencia de datos [Modelo]

1. Carga de librerías y datos

2. Limpieza y exploración inicial de datos

#SELECCIÓN DE VARIABLES DE LA BASE ORIGINAL QUE SE EMPLEARÁN EN EL ANÁLISIS
variables <- c("p120", "p125", "p126", "p127", "p128", "p131_1", "p132_2", "p139", "p140_1", "p141_2", "p145", "p154", "p155", "p156", "p138",
"num_acciones_participacion_ciudadana", "p161", "p162", "p163",
"p164", "nom_mun_mv", "mun_amm_y_periferia", "cp4_1", "sexo_num", "Factor_CVNL", "cp7_1", "cp8_1", "cp9_1",
"p100", "p114",
"p52_1", "p52_2", "p52_3", "p52_4", "p52_5",
"p59", "p64",
"p73", "p74", "p75",
"num_discriminaciones")

#CREACIÓN DE DATAFRAME CON VARIABLES SELECCIONADAS
eav2024_selected <- eav2024[, variables]

#CREACIÓN DE DATAFRAME CON DATOS ÚNICAMENTE DE AMM (EL ANÁLISIS SE CENTRARÁ EN ESTA ÁREA)
amm <- subset(eav2024_selected, nom_mun_mv %in% c("San Nicolás de los Garza", "Guadalupe", "Apodaca", "San Pedro Garza García", "Santa Catarina", "General Escobedo", "Juárez", "García", "Santiago", "Pesquería", "Cadereyta Jiménez", "Monterrey"))

#OBSERVACIÓN DE 8888 Y 9999 PARA DETERMINAR QUÉ HACER CON ELLOS
m=sapply(amm, function(col) sum(col== 8888, na.rm = TRUE))
total1 <- sum(m) #SON EN TOTAL 2889, POR LO QUE NO SE VAN A ELIMINAR
m2=sapply(amm, function(col) sum(col== 9999, na.rm = TRUE))
total2 <- sum(m2)  #SON EN TOTAL 557, POR LO QUE SÍ SE VAN A ELIMINAR

#ELIMINACIÓN DE 9999
numericas <- c("p120", "p125", "p126", "p127", "p128", "p131_1", "p132_2", "p139", "p140_1", "p141_2", "p145", "p154", "p155", "p156", "p138",
"num_acciones_participacion_ciudadana", "p161", "p162", "p163",
"p164",
"p100", "p114",
"p52_1", "p52_2", "p52_3", "p52_4", "p52_5",
"p59", "p64",
"p73", "p74", "p75",
"num_discriminaciones") #IDENTIFICACIÓN DE VARIABLES CON 9999
fillas_99 <- rowSums(amm[, numericas] == 9999, na.rm = TRUE) == 0 #IDENTIFICACIÓN DE FILAS SIN 9999
eav2024_limpio <- amm[fillas_99,] #DATAGRAME SÓLO CON FILAS SIN 9999

#TRANSFORMACIÓN DE VARIABLES CATEGÓRICAS A FACTOR
eav2024_categoricas <- eav2024_limpio  %>%
  mutate(
    sexo = case_when(
      sexo_num == 0 ~ "Hombre",
      sexo_num == 1 ~ "Mujer"
    ),
    sexo = as.factor(sexo), #CREACIÓN DE VARIABLE FACTOR PARA SEXO (CON HOMBRE Y MUJER COMO TEXTO)
    Municipio = as.factor(nom_mun_mv), #CREACIÓN DE VARIABLE FACTOR PARA MUNICIPIO
  ) %>%
  mutate(across(
    .cols = c(
      p120, p126,p125, p126,p131_1, p132_2,p140_1,p141_2,p154, p161, p162, p163, p164, mun_amm_y_periferia, cp7_1, p64
    ),
    .fns = as.factor
  )) %>% #CONVERTIR VARIABLES QUE SE LEYERON COMO NUMÉRICAS PERO QUE EN REALIDAD SON CATEGÓRICAS A FACTOR
  select(-sexo_num, -nom_mun_mv) #ELIMINACIÓN DE COLUMNAS NUMÉRICAS QUE YA NO SON NECESARIAS

#RENOMBRADO DE VARIABLES
eav2024_renombre <- eav2024_categoricas %>%
  rename(
    voto_pasado = p120,
    alcalde_uso_recursos = p125,
    gob_uso_recursos = p126,
    transparencia_estado = p127,
    transparencia_municipio = p128,
    alcalde_honesto = p131_1,
    alcalde_capaz = p132_2,
    ineficiencia_gobierno_alcalde = p139,
    samuel_honesto = p140_1,
    samuel_capaz = p141_2,
    ineficiencia_gobierno_samuel = p145,
    aprueba_congreso = p154,
    percep_no_influencia_ciud_est = p155,
    percep_no_influencia_ciud_mun = p156,
    acciones_participacion = num_acciones_participacion_ciudadana,
    problema_confianza = p161,
    ambito_problema = p162,
    orgullo_nl = p163,
    ingreso_hogar = p164,
    zona_geografica = mun_amm_y_periferia,
    factor_expansion = Factor_CVNL,
    estudia_actualmente = cp7_1,
    nivel_educativo_max = cp8_1,
    nivel_educativo_actual = cp9_1,
    edad = cp4_1,
    percepcion_seguridad = p100,
    violencia_mujeres = p114,
    satisfaccion_alumbrado = p52_1,
    satisfaccion_calles = p52_2,
    satisfaccion_parques = p52_3,
    satisfaccion_drenaje = p52_4,
    satisfaccion_basura = p52_5,
    preocupacion_agua = p59,
    acciones_contaminacion = p64,
    situacion_economica_actual = p73,
    situacion_economica_futura = p74,
    suficiencia_ingreso = p75,
    num_discriminaciones = num_discriminaciones,
    percepcion_municipio = p138
  )

3. Preprocesamiento y creación de variables nuevas

#CREACIÓN DE VARIABLE EDUCACIÓN
niveles_educativos <- c(
  "Ninguno", "Preescolar", "Primaria", "Secundaria",
  "Preparatoria o bachillerato general", "Bachillerato tecnológico",
  "Técnicos con primaria", "Técnicos con secundaria",
  "Técnicos con preparatoria", "Normal con primaria o secundaria",
  "Normal de licenciatura", "Licenciatura",
  "Especialidad", "Maestría", "Doctorado", "No sabe"
) #ESPECIFICACIÓN DE NIVELES DE EDUCACIÓN EN LA ENCUESTA
eav2024_edu <- eav2024_renombre %>%
  mutate(
    nivel_educativo = case_when(
      estudia_actualmente == 1 ~ nivel_educativo_actual,
      estudia_actualmente == 0 ~ nivel_educativo_max,
      TRUE ~ NA_integer_  
    ), #SI LA PERSONA ESTUDIA ACTUALMENTE SE TOMA SU NIVEL ACTUAL, SINO SE TOMA SU NIVEL MÁXIMO (ESTO PARA JUNTAR LAS DOS VARIABLES Y EVITAR NA)
    nivel_educativo = ifelse(nivel_educativo %in% 8888:9999, NA, nivel_educativo),  #REEMPLAZAR 888 Y 9999 POR NA
    nivel_educativo = factor(nivel_educativo,
                             levels = 0:15,
                             labels = niveles_educativos) #CONVERTIR NIVEL_EDUCATIVO A FACTOR CON LAS ETIQUETAS CREADAS ANTES
  ) %>%
  select(-estudia_actualmente, -nivel_educativo_max, -nivel_educativo_actual) #ELIMINAR LAS VARIABLES QUE YA NO SIRVEN EN NUESTRA BASE

#CREACIÓN DE VARIABLE INDEPENDIENTE: USO DE RECURSOS PÚBLICOS
eav2024_na1 <- eav2024_edu %>%
  mutate(
    alcalde_uso_recursos = ifelse(alcalde_uso_recursos == 8888, NA, alcalde_uso_recursos),
    gob_uso_recursos     = ifelse(gob_uso_recursos == 8888, NA, gob_uso_recursos), #REEMPLAZO DE 8888 POR NA DE AMBAS VARIABLES QUE VAN A FORMAR A USO DE RECURSOS
    alcalde_uso_recursos = as.numeric(as.character(alcalde_uso_recursos)),
    gob_uso_recursos     = as.numeric(as.character(gob_uso_recursos)), #CONVERTIR A NUMÉRICAS LAS VARIABLES
    alcalde_uso_recursos = ifelse(alcalde_uso_recursos == 2, 1,
                             ifelse(alcalde_uso_recursos == 1, 0, alcalde_uso_recursos)),
    gob_uso_recursos     = ifelse(gob_uso_recursos == 2, 1,
                             ifelse(gob_uso_recursos == 1, 0, gob_uso_recursos))
  ) %>% #COMO LAS VARIABLES SE LEÍAN COMO 2 Y 1 Y NO COMO 0 Y 1, SE RECONFIGURARON PARA LEERSE ADECUADAMENTE
  mutate(
    uso_recursos_num = round(alcalde_uso_recursos * 0.7 + gob_uso_recursos * 0.3), #CREACIÓN DE NUEVA VARIABLE, DANDO MAYOR PESO A VARIABLE MUNICIPAL, SE REDONDEA EL RESULTADO PARA QUE SIGA SIENDO SOLO 0 Y 1
    uso_recursos_num = ifelse(is.na(alcalde_uso_recursos) & is.na(gob_uso_recursos), NA, uso_recursos_num), #SI AMBOS FACTORES SON NA ENTONCES SE DEJA COMO NA EN EL PONDERADO
    uso_recursos = factor(
      uso_recursos_num,
      levels = c(0, 1),
      labels = c("Sí", "No")
    )
  ) #CONVERTIR NUEVAMENTE A FACTOR, CON TEXTO SÍ Y NO EN LUGAR DE 0 Y 1

#CREACIÓN DE VARIABLE INDEPENDIENTE: PERCEPCIÓN DE TRANSPARENCIA
eav2024_na2 <- eav2024_na1 %>%
  mutate(
    transparencia_municipio = ifelse(transparencia_municipio== 8888, NA, transparencia_municipio),
    transparencia_estado = ifelse(transparencia_estado == 8888, NA, transparencia_estado)
  ) #CONVERTIR 8888 EN AMBAS VARIABLES POR NA
eav2024_na3 <- eav2024_na2 %>%
  mutate(
    transparencia= transparencia_municipio * 0.7 + transparencia_estado * 0.3
  ) #COMO ES NUMÉRICA, SÓLO SE NECESITA PONDERAR LAS RESPUESTAS, DÁNDOLE MAYOR PESO A MUNICIPAL

#CREACIÓN DE VARIABLE INDEPENDIENTE: PERCEPCIÓN DE HONESTIDAD GUBERNAMENTAL
eav2024_na4 <- eav2024_na3 %>%
  mutate(
    alcalde_honesto = ifelse(alcalde_honesto == 8888, NA, alcalde_honesto),
    samuel_honesto  = ifelse(samuel_honesto == 8888, NA, samuel_honesto),#REEMPLAZO DE 8888 POR NA DE AMBAS VARIABLES QUE VAN A FORMAR HONESTIDAD
    alcalde_honesto = as.numeric(as.character(alcalde_honesto)),
    samuel_honesto  = as.numeric(as.character(samuel_honesto)), #CONVERTIR A NUMÉRICAS LAS VARIABLES
    alcalde_honesto = ifelse(alcalde_honesto == 2, 1,
                        ifelse(alcalde_honesto == 1, 0, alcalde_honesto)),
    samuel_honesto  = ifelse(samuel_honesto == 2, 1,
                        ifelse(samuel_honesto == 1, 0, samuel_honesto))
  ) %>%  #COMO LAS VARIABLES SE LEÍAN COMO 2 Y 1 Y NO COMO 0 Y 1, SE RECONFIGURARON PARA LEERSE ADECUADAMENTE
  mutate(
    honestidad_num = round(alcalde_honesto * 0.7 + samuel_honesto * 0.3), #CREACIÓN DE NUEVA VARIABLE, DANDO MAYOR PESO A VARIABLE MUNICIPAL, SE REDONDEA EL RESULTADO PARA QUE SIGA SIENDO SOLO 0 Y 1
    honestidad_num = ifelse(is.na(alcalde_honesto) & is.na(samuel_honesto), NA, honestidad_num),  #SI AMBOS FACTORES SON NA ENTONCES SE DEJA COMO NA EN EL PONDERADO
    honestidad = factor(
      honestidad_num,
      levels = c(0, 1),
      labels = c("Sí", "No") #CONVERTIR NUEVAMENTE A FACTOR, CON TEXTO SÍ Y NO EN LUGAR DE 0 Y 1
    )
  )


#CREACIÓN DE VARIABLE INDEPENDIENTE: PERCEPCIÓN DE CAPACIDAD DE RESOLUCIÓN DE PROBLEMAS
eav2024_na5 <- eav2024_na4 %>%
  mutate(
    alcalde_capaz = ifelse(alcalde_capaz == 8888, NA, alcalde_capaz),
    samuel_capaz  = ifelse(samuel_capaz == 8888, NA, samuel_capaz), #REEMPLAZO DE 8888 POR NA DE AMBAS VARIABLES QUE VAN A FORMAR CAPACIDAD DE RESOLUCIÓN DE PROBLEMAS
    alcalde_capaz = as.numeric(as.character(alcalde_capaz)),
    samuel_capaz  = as.numeric(as.character(samuel_capaz)),  #CONVERTIR A NUMÉRICAS LAS VARIABLES
    alcalde_capaz = ifelse(alcalde_capaz == 2, 1,
                      ifelse(alcalde_capaz == 1, 0, alcalde_capaz)),
    samuel_capaz  = ifelse(samuel_capaz == 2, 1,
                      ifelse(samuel_capaz == 1, 0, samuel_capaz))
  ) %>% #COMO LAS VARIABLES SE LEÍAN COMO 2 Y 1 Y NO COMO 0 Y 1, SE RECONFIGURARON PARA LEERSE ADECUADAMENTE
  mutate(
    cap_solucion_num = round(alcalde_capaz * 0.7 + samuel_capaz * 0.3),  #CREACIÓN DE NUEVA VARIABLE, DANDO MAYOR PESO A VARIABLE MUNICIPAL, SE REDONDEA EL RESULTADO PARA QUE SIGA SIENDO SOLO 0 Y 1
    cap_solucion_num = ifelse(is.na(alcalde_capaz) & is.na(samuel_capaz), NA, cap_solucion_num), #SI AMBOS FACTORES SON NA ENTONCES SE DEJA COMO NA EN EL PONDERADO
    cap_solucion_problemas = factor(
      cap_solucion_num,
      levels = c(0, 1),
      labels = c("Sí", "No")
    )
  ) #CONVERTIR NUEVAMENTE A FACTOR, CON TEXTO SÍ Y NO EN LUGAR DE 0 Y 1


#CREACIÓN DE VARIABLE INDEPENDIENTE: PERCEPCIÓN DE LA CAPACIDAD DE INFLUENCIA CIUDADANA
eav2024_na6 <- eav2024_na5 %>%
  mutate(
    percep_no_influencia_ciud_mun = ifelse(percep_no_influencia_ciud_mun== 8888, NA, percep_no_influencia_ciud_mun),
    percep_no_influencia_ciud_est = ifelse(percep_no_influencia_ciud_est == 8888, NA, percep_no_influencia_ciud_est)
  ) #CONVERTIR 8888 EN AMBAS VARIABLES POR NA
eav2024_na7 <- eav2024_na6 %>%
  mutate(
    percep_no_influenci_ciudadana= percep_no_influencia_ciud_mun * 0.7 + percep_no_influencia_ciud_est * 0.3
  ) #COMO ES NUMÉRICA, SÓLO SE NECESITA PONDERAR LAS RESPUESTAS, DÁNDOLE MAYOR PESO A MUNICIPAL

#CREACIÓN DE VARIABLE INDEPENDIENTE: SATISFACCIÓN DE SERVICIOS PÚBLICOS
eav2024_na8 <- eav2024_na7 %>%
  mutate(
    satisfaccion_alumbrado = ifelse(satisfaccion_alumbrado== 8888, NA, satisfaccion_alumbrado),
    satisfaccion_calles = ifelse(satisfaccion_calles == 8888, NA, satisfaccion_calles),
    satisfaccion_parques = ifelse(satisfaccion_parques == 8888, NA, satisfaccion_parques),
    satisfaccion_drenaje = ifelse(satisfaccion_drenaje == 8888, NA, satisfaccion_drenaje),
    satisfaccion_basura = ifelse(satisfaccion_basura == 8888, NA, satisfaccion_basura)
  ) #REEMPLAZO DE 8888 POR NA DE TODAS LAS VARIABLES QUE VAN A FORMAR SATISFACCIÓN DE SERVICIOS PÚBLICOS
eav2024_na9 <- eav2024_na8 %>%
  mutate(
    satisfaccion_serv_publicos= satisfaccion_alumbrado * 0.2 + satisfaccion_calles * 0.2 + satisfaccion_parques * 0.2 + satisfaccion_drenaje * 0.2 + satisfaccion_basura * 0.2
  ) #COMO ES NUMÉRICA, SE HACE UN PONDERADO EQUITATIVO ENTRE TODAS LAS VARIABLES

#CREACIÓN DE VARIABLE DEPENDIENTE: NIVEL DE DESCONFIANZA
eav2024_na10 <- eav2024_na9 %>%
  mutate(
    ineficiencia_gobierno_alcalde = ifelse(ineficiencia_gobierno_alcalde== 8888, NA, ineficiencia_gobierno_alcalde),
    ineficiencia_gobierno_samuel = ifelse(ineficiencia_gobierno_samuel == 8888, NA, ineficiencia_gobierno_samuel)
  ) #REEMPLAZO DE 8888 POR NA EN AMBAS VARIABLES
eav2024_na11 <- eav2024_na10 %>%
  mutate(
    nivel_desconfianza = ineficiencia_gobierno_alcalde * 0.7 + ineficiencia_gobierno_samuel * 0.3
  ) #COMO ES NUMÉRICA SÓLO ES NECESARIO EL PONDERADO, DÁNDOLE MÁS PESO A MUNICIPAL

#BASE DE DATOS CON SÓLO VARIABLES CREADAS
variables_finales <- c("voto_pasado", "aprueba_congreso", "percepcion_municipio", "acciones_participacion", "problema_confianza", "ambito_problema", "orgullo_nl", "ingreso_hogar", "zona_geografica", "edad", "factor_expansion", "percepcion_seguridad", "violencia_mujeres", "preocupacion_agua", "acciones_contaminacion", "situacion_economica_actual", "situacion_economica_futura", "suficiencia_ingreso", "num_discriminaciones", "sexo", "Municipio", "nivel_educativo", "uso_recursos", "transparencia", "honestidad", "cap_solucion_problemas", "percep_no_influenci_ciudadana", "satisfaccion_serv_publicos", "nivel_desconfianza") #SELECCIÓN SÓLO DE VARIABLES NECESARIAS EN EL ANÁLISIS
eav2024_semifinal <- eav2024_na11[, variables_finales]  #CREACIÓN DEL DATAFRAME

#OBSERVACIÓN DE NAs
sum(is.na(eav2024_semifinal)) #OBSERVAR TOTAL DE NA

## [1] 1256

colSums(is.na(eav2024_semifinal)) #OBSERVAR CUÁNTOS NA HAY POR CADA VARIABLE

##                   voto_pasado              aprueba_congreso 
##                             0                             0 
##          percepcion_municipio        acciones_participacion 
##                             0                             0 
##            problema_confianza               ambito_problema 
##                             0                             0 
##                    orgullo_nl                 ingreso_hogar 
##                             0                             0 
##               zona_geografica                          edad 
##                             0                             0 
##              factor_expansion          percepcion_seguridad 
##                             0                             0 
##             violencia_mujeres             preocupacion_agua 
##                             0                             0 
##        acciones_contaminacion    situacion_economica_actual 
##                             0                             0 
##    situacion_economica_futura           suficiencia_ingreso 
##                             0                             0 
##          num_discriminaciones                          sexo 
##                           220                             0 
##                     Municipio               nivel_educativo 
##                             0                             0 
##                  uso_recursos                 transparencia 
##                           169                           186 
##                    honestidad        cap_solucion_problemas 
##                           297                           145 
## percep_no_influenci_ciudadana    satisfaccion_serv_publicos 
##                            87                            10 
##            nivel_desconfianza 
##                           142

#IMPUTACIÓN
vars_imputacion <- eav2024_semifinal %>%
  select(uso_recursos, honestidad, percep_no_influenci_ciudadana, nivel_desconfianza, transparencia,
         cap_solucion_problemas, num_discriminaciones, satisfaccion_serv_publicos, 
         sexo, edad) #SELECCIÓN DE VARIABLES QUE DEBEN SER IMPUTADAS PORQUE TIENEN NA Y SEXO Y EDAD PORQUE VAN A SER LA BASE DE LA IMPUTACION
vars_imputacion$sexo <- as.factor(vars_imputacion$sexo) #GARANTIZAR LECTURA DE SEXO COMO FACTOR
vars_imputacion$edad <- as.numeric(vars_imputacion$edad) #GARANTIZAR LECTURA DE EDAD COMO NUMERIC
#LECTURA DE LAS 3 CATEGÓRICAS COMO TAL
vars_imputacion$uso_recursos <- as.factor(vars_imputacion$uso_recursos) 
vars_imputacion$honestidad <- as.factor(vars_imputacion$honestidad)
vars_imputacion$cap_solucion_problemas <- as.factor(vars_imputacion$cap_solucion_problemas)
# INICIO DE IMPUTACIÓN
ini <- mice(vars_imputacion, maxit = 0) 
predMatrix <- ini$predictorMatrix
meth <- ini$method
#SELECCIÓN DE SÓLO EDAD Y SEXO COMO LAS VARIABLES PREDICTORAS
predMatrix[,] <- 0
predMatrix[c("uso_recursos", "honestidad", "percep_no_influenci_ciudadana", "nivel_desconfianza", 
             "transparencia", "cap_solucion_problemas", "num_discriminaciones", "satisfaccion_serv_publicos"), 
           c("edad", "sexo")] <- 1
#DEFINICIÓN DE MÉTODO DE IMPUTACIÓN PARA CATEGÓRICAS
meth["uso_recursos"] <- "logreg"
meth["honestidad"] <- "logreg"
meth["cap_solucion_problemas"] <- "logreg"
#REALIZAR IMPUTACIÓN 
imputacion <- mice(vars_imputacion, m = 1, method = meth, predictorMatrix = predMatrix, seed = 123)

## 
##  iter imp variable
##   1   1  uso_recursos  honestidad  percep_no_influenci_ciudadana  nivel_desconfianza  transparencia  cap_solucion_problemas  num_discriminaciones  satisfaccion_serv_publicos
##   2   1  uso_recursos  honestidad  percep_no_influenci_ciudadana  nivel_desconfianza  transparencia  cap_solucion_problemas  num_discriminaciones  satisfaccion_serv_publicos
##   3   1  uso_recursos  honestidad  percep_no_influenci_ciudadana  nivel_desconfianza  transparencia  cap_solucion_problemas  num_discriminaciones  satisfaccion_serv_publicos
##   4   1  uso_recursos  honestidad  percep_no_influenci_ciudadana  nivel_desconfianza  transparencia  cap_solucion_problemas  num_discriminaciones  satisfaccion_serv_publicos
##   5   1  uso_recursos  honestidad  percep_no_influenci_ciudadana  nivel_desconfianza  transparencia  cap_solucion_problemas  num_discriminaciones  satisfaccion_serv_publicos

datos_imputados <- complete(imputacion)
#REINTEGRACIÓN DE VARIABLES IMPUTADAS A BASE 
eav2024_semifinal$uso_recursos <- datos_imputados$uso_recursos
eav2024_semifinal$honestidad <- datos_imputados$honestidad
eav2024_semifinal$cap_solucion_problemas <- datos_imputados$cap_solucion_problemas
eav2024_semifinal$percep_no_influenci_ciudadana <- datos_imputados$percep_no_influenci_ciudadana
eav2024_semifinal$nivel_desconfianza <- datos_imputados$nivel_desconfianza
eav2024_semifinal$transparencia <- datos_imputados$transparencia
eav2024_semifinal$num_discriminaciones <- datos_imputados$num_discriminaciones
eav2024_semifinal$satisfaccion_serv_publicos <- datos_imputados$satisfaccion_serv_publicos

# VERIFICAR NA PARA GARANTIZAR QUE YA NO HAY MÁS
sum(is.na(eav2024_semifinal))

## [1] 0

# GUARDAR BASE FINAL
eav2024_final <- eav2024_semifinal

4. División de datos en entrenamiento y prueba

# PARA EL ENTRENAMIENTO DE MODELOS SE REALIZÓ DIVISÓN EN DATOS DE ENTRENAMIENTO Y PRUEBA
set.seed(123) #PARA GARANTIZAR RESULTADOS QUE SE REPITAN
n <- nrow(eav2024_final)
aleatorios <- sample(1:n, size=0.7*n)

#CREACIÓN DE MODELO DE ENTRENAMIENTO CON PESO 0.7
entrenamiento <- eav2024_final[aleatorios,]

#CREACIÓN DE MODELO DE PRUEBA
prueba <- eav2024_final[-aleatorios,]

5. Modelo inicial

# DEFINICIÓN DE MODELO LINEAL INICIAL CON VARIABLES PREDICTORAS YA AJUSTADAS (ES DECIR, YA SE ELIMINARON AQUELLAS QUE NO ERAN SIGNIFICATIVAS)
modelo_desconfianza <- lm(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
                    cap_solucion_problemas + satisfaccion_serv_publicos, 
                  data = eav2024_final, weights = 
                    factor_expansion)
summary(modelo_desconfianza) #VISUALIZAR EL MODELO

## 
## Call:
## lm(formula = nivel_desconfianza ~ uso_recursos + transparencia + 
##     honestidad + cap_solucion_problemas + satisfaccion_serv_publicos, 
##     data = eav2024_final, weights = factor_expansion)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -82.358  -8.847  -0.302   9.621  71.714 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 3.462482   0.046080  75.140  < 2e-16 ***
## uso_recursosNo             -0.267093   0.027539  -9.699  < 2e-16 ***
## transparencia              -0.086830   0.013457  -6.453 1.25e-10 ***
## honestidadNo               -0.412826   0.031871 -12.953  < 2e-16 ***
## cap_solucion_problemasNo   -0.649199   0.033070 -19.631  < 2e-16 ***
## satisfaccion_serv_publicos -0.033708   0.006005  -5.613 2.14e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.56 on 3507 degrees of freedom
## Multiple R-squared:  0.5985, Adjusted R-squared:  0.5979 
## F-statistic:  1046 on 5 and 3507 DF,  p-value: < 2.2e-16

vif(modelo_desconfianza) #COMPROBAR QUE NO HAY MULTICOLINEALIDAD (NO LA HAY)

##               uso_recursos              transparencia 
##                   2.305303                   1.107723 
##                 honestidad     cap_solucion_problemas 
##                   3.077697                   3.324686 
## satisfaccion_serv_publicos 
##                   1.168736

#plot(modelo_desconfianza$residuals)
bptest(modelo_desconfianza) #HAY HETEROCEDASTICIDAD, PERO NO ES UN PROBLEMA

## 
##  studentized Breusch-Pagan test
## 
## data:  modelo_desconfianza
## BP = 3336671, df = 5, p-value < 2.2e-16

coeftest(modelo_desconfianza, vcov = vcovHC(modelo_desconfianza, type = "HC1")) #PARA CORREGIR HETEROCEDASTICIDADES EN EVALUACIÓN DEL MODELO

## 
## t test of coefficients:
## 
##                              Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept)                 3.4624825  0.0621566  55.7058 < 2.2e-16 ***
## uso_recursosNo             -0.2670926  0.0388537  -6.8743 7.342e-12 ***
## transparencia              -0.0868298  0.0173229  -5.0124 5.641e-07 ***
## honestidadNo               -0.4128260  0.0489963  -8.4257 < 2.2e-16 ***
## cap_solucion_problemasNo   -0.6491987  0.0522022 -12.4362 < 2.2e-16 ***
## satisfaccion_serv_publicos -0.0337083  0.0077378  -4.3563 1.361e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6. Evaluación del modelo inicial

prediccion <- predict(modelo_desconfianza, newdata = prueba) #GENERAR PREDICCIONES SEGÚN EL MODELO DE DESCONFIANZA
rmse_desconfianza <- sqrt(mean((prueba$nivel_desconfianza - prediccion)^2)) #CÁLCULO DEL PROMEDIO DEL ERROR CUADRÁTICO 
mae_desconfianza <- mean(abs(prueba$nivel_desconfianza - prediccion)) #CÁLCULO DEL PROMEDIO DEL ERROR ABSOLUTO
r2_desconfianza <- cor(prueba$nivel_desconfianza, prediccion)^2 #CÁLCULO DE LA R2
#CÁLCULO DE RESIDUALES
residuales <- resid(modelo_desconfianza)
#GRÁFICO DE AJUSTE DE RESIDUALES
qqnorm(residuales)
qqline(residuales)

7. Selección de modelos lineales y reducción de dimensiones

#MÉTODO PLS 
modelo_pls <- plsr(
  nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos,
  data = eav2024_final,
  weights = factor_expansion,
  scale = TRUE,
  validation = "CV"
) #AJUSTE DE MODELO INICIAL PLS
summary(modelo_pls) #VISUALIZACIÓN DEL MODELO

## Data:    X dimension: 3513 5 
##  Y dimension: 3513 1
## Fit method: kernelpls
## Number of components considered: 5
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps
## CV          0.8397   0.5383   0.5327   0.5311    0.531   0.5310
## adjCV       0.8397   0.5383   0.5327   0.5310    0.531   0.5309
## 
## TRAINING: % variance explained
##                     1 comps  2 comps  3 comps  4 comps  5 comps
## X                     56.35    71.61    78.89    90.97   100.00
## nivel_desconfianza    58.95    59.84    60.11    60.12    60.13

plot(RMSEP(modelo_pls), legendpos = "topright") #GRÁFICA DEL RMSE DEL MODELO SEGÚN EL NÚMERO DE COMPONENTES

#AJUSTE DEL MODELO CON DATOS DE ENTRENAMIENTO
modelo_pls_TRAIN <- plsr(
  nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos,
  data = entrenamiento,
  weights = factor_expansion,
  scale = TRUE,
  validation = "CV"
)

#PREDICCIÓN SOBRE CONJUNTO DE PRUEBA
pred_pls <- predict(modelo_pls_TRAIN, newdata = prueba, ncomp = 2) #PREDICCIÓN CON DOS COMPONENTES PORQUE DE AHÍ EN ADELANTE LA MEJOR ES MUY POCO SIGNIFICATIVA
rmse_pls <- sqrt(mean((prueba$nivel_desconfianza - pred_pls)^2)) #CÁLCULO DE RMSE
mae_pls <- mean(abs(prueba$nivel_desconfianza - pred_pls)) #CÁLCULO DE MAE
r2_pls <- cor(prueba$nivel_desconfianza, pred_pls)^2 #CÁLCULO DE R2

#MÉTODO RIDGE
#PREPARACIÓN DE DATOS X Y Y SEGÚN MIS VARIABLES
x <- model.matrix(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
                    cap_solucion_problemas + satisfaccion_serv_publicos, eav2024_final)[ ,-1]
y <- eav2024_final$nivel_desconfianza
#ASIGNACIÓN DEL FACTOR DE EXPANSIÓN COMO FACTOR, PARA PODER USAR WEIGHTS
factor <- eav2024_final$factor_expansion
#ENTRENAMIENTO CON VALIDACIÓN CRUZADA
seq(10,-2,length=10)

##  [1] 10.0000000  8.6666667  7.3333333  6.0000000  4.6666667  3.3333333
##  [7]  2.0000000  0.6666667 -0.6666667 -2.0000000

grid <- 10^seq(10,-2,length=10)
ridge.mod <- glmnet(x,y,alpha=0,lambda=grid, weights = factor)
set.seed(123) #PARA GARANTIZAR MISMOS RESULTADOS
cv.ridge <- cv.glmnet(x, y, alpha = 0, weights = factor)
lamba_optimo <- cv.ridge$lambda.min  
#REETRENAMENTO CON LA MABDA ÓPTIMA
ridge <- glmnet(x, y, alpha = 0, lambda = lamba_optimo, weights = factor)
coef(ridge) #OBSERVAR LOS COEFICIENTES

## 6 x 1 sparse Matrix of class "dgCMatrix"
##                                     s0
## (Intercept)                 3.45694764
## uso_recursosNo             -0.28570745
## transparencia              -0.08817803
## honestidadNo               -0.41669211
## cap_solucion_problemasNo   -0.58705130
## satisfaccion_serv_publicos -0.03536184

# AJUSTE DEL MODELO A DATOS DE ETRENAMIENTO
x_train <- model.matrix(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
                         cap_solucion_problemas + satisfaccion_serv_publicos, entrenamiento)[ ,-1]
y_train <- entrenamiento$nivel_desconfianza
grid <- 10^seq(10,-2,length=10)
ridge.mod_train <- glmnet(x_train, y_train, alpha=0, lambda=grid, weights = entrenamiento$factor_expansion)
cv.ridge_train <- cv.glmnet(x_train, y_train, alpha = 0, weights = entrenamiento$factor_expansion)
lambda_optimo_train <- cv.ridge$lambda.min

# PREDICCIÓN CON CONUNTO DE PRUEBA
ridge_pred <- predict(ridge.mod_train, s = lambda_optimo_train, newx = model.matrix(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
                                                                          cap_solucion_problemas + satisfaccion_serv_publicos, prueba)[ ,-1])

#CÁLCULO DE VARIABLES DE RMSE, MAE Y R2
rmse_ridge <- sqrt(mean((prueba$nivel_desconfianza - ridge_pred)^2))
mae_ridge <- mean(abs(prueba$nivel_desconfianza - ridge_pred))
r2_ridge <- cor(prueba$nivel_desconfianza, ridge_pred)^2

#MÉTODO LASSO
factor <- eav2024_final$factor_expansion #PARA PODER USAR WEIGHTS
#PREPARACIÓN DE DATOS PARA ENTRENAMIENTO Y PRUEBA
set.seed(123)
train = sample(1:nrow(x), nrow(x)/2)
test = -train
y.test = y[test]

#ENTRENAMIENTO CON VALIDACIÓN CRUZADA
lasso.mod <- glmnet(x[train ,], y[train], alpha=1, lambda=grid, weights = factor[train])
cv.out <- cv.glmnet(x[train ,], y[train], alpha=1, weights = factor[train]) 
plot(cv.out) #GRÁFICO

bestlam <- cv.out$lambda.min #LA LAMBDA ÓPTIMA

#PREDICCIONES CON CONJUNTO DE PRUEBA
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test,])

#COEFICIENTES SELECCIONADOS CON LASSO
out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients", s = bestlam)[1:5,]
lasso.coef[lasso.coef != 0]

##              (Intercept)           uso_recursosNo            transparencia 
##                3.3782121               -0.2434922               -0.0656176 
##             honestidadNo cap_solucion_problemasNo 
##               -0.3797419               -0.6870017

#CÁLCULO DE RMSE, MAE Y R2
rmse_lasso <- sqrt(mean((y.test - lasso.pred)^2))
mae_lasso <- mean(abs(y.test - lasso.pred))
r2_lasso <- cor(y.test, lasso.pred)^2

# SUBCONJUNTOS FULL
#MODELO CON TODO EL CONJUNTO PARA ANÁLISIS INICIAL
regfit.full <- regsubsets (nivel_desconfianza~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos, data=eav2024_final, nvmax=5)
reg.summary  <- summary(regfit.full)
reg.summary #PARA VER QUÉ OBTUVO EL MODELO

## Subset selection object
## Call: regsubsets.formula(nivel_desconfianza ~ uso_recursos + transparencia + 
##     honestidad + cap_solucion_problemas + satisfaccion_serv_publicos, 
##     data = eav2024_final, nvmax = 5)
## 5 Variables  (and intercept)
##                            Forced in Forced out
## uso_recursosNo                 FALSE      FALSE
## transparencia                  FALSE      FALSE
## honestidadNo                   FALSE      FALSE
## cap_solucion_problemasNo       FALSE      FALSE
## satisfaccion_serv_publicos     FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: exhaustive
##          uso_recursosNo transparencia honestidadNo cap_solucion_problemasNo
## 1  ( 1 ) " "            " "           " "          "*"                     
## 2  ( 1 ) " "            " "           "*"          "*"                     
## 3  ( 1 ) "*"            " "           "*"          "*"                     
## 4  ( 1 ) "*"            " "           "*"          "*"                     
## 5  ( 1 ) "*"            "*"           "*"          "*"                     
##          satisfaccion_serv_publicos
## 1  ( 1 ) " "                       
## 2  ( 1 ) " "                       
## 3  ( 1 ) " "                       
## 4  ( 1 ) "*"                       
## 5  ( 1 ) "*"

which.max(reg.summary$adjr2) #PARA VER LA MÁXIMA R2 OBTENIDA CON CUÁNTAS VARIABLES

## [1] 5

which.min(reg.summary$bic) #PARA VER EL MÍNIMO BIC OBTENIDO CON CUÁNTAS VARIABLES

## [1] 5

coeficientes <- coef(regfit.full, id = 5) #VER LOS COEFICIENTES SABIENDO QUE LO IDEAL SON 5 VARIABLES
print(coeficientes)

##                (Intercept)             uso_recursosNo 
##                 3.43153901                -0.24947525 
##              transparencia               honestidadNo 
##                -0.07511558                -0.38533598 
##   cap_solucion_problemasNo satisfaccion_serv_publicos 
##                -0.69000738                -0.03340941

#PARA VER LAS MÉTRICAS R2 Y BIC
reg.summary$adjr2[5]

## [1] 0.6006821

reg.summary$bic[5]

## [1] -3180.945

# AJUSTE CON CONJUNTO DE ENTRENAMIENTO
regfit.full_TRAIN <- regsubsets(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + cap_solucion_problemas + satisfaccion_serv_publicos, 
                          data = entrenamiento, nvmax = 5)
reg.summary_TRAIN <- summary(regfit.full_TRAIN)
mejor_modelo_full_TRAIN <- which.max(reg.summary$adjr2) #PARA VER CUÁNTAS VARIABLES SON LAS IDEALES
coef_full_TRAIN <- coef(regfit.full, id = mejor_modelo_full_TRAIN) #PARA VER COEFICIENTES
X_test_full_TRAIN <- model.matrix(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + cap_solucion_problemas + satisfaccion_serv_publicos, prueba)[, -1]

#CÁLCULO DE PREDICCIONES
pred_full <- X_test_full_TRAIN %*% coef_full_TRAIN[-1] + coef_full_TRAIN[1]

#CÁLCULO DE MÉTRICAS RMSE, MAE Y R2
rmse_full <- sqrt(mean((prueba$nivel_desconfianza - pred_full)^2))
mae_full <- mean(abs(prueba$nivel_desconfianza - pred_full))
r2_full <- cor(prueba$nivel_desconfianza, pred_full)^2

#MÉTODO FORWARD
#MODELO CON TODO EL CONJUNTO PARA ANÁLISIS INICIAL
regfit.fwd <- regsubsets (nivel_desconfianza~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos, data=eav2024_final, method = "forward", nvmax=5)
reg.summary.fwd  <- summary(regfit.fwd)
reg.summary.fwd #OBSERVAR MODELO

## Subset selection object
## Call: regsubsets.formula(nivel_desconfianza ~ uso_recursos + transparencia + 
##     honestidad + cap_solucion_problemas + satisfaccion_serv_publicos, 
##     data = eav2024_final, method = "forward", nvmax = 5)
## 5 Variables  (and intercept)
##                            Forced in Forced out
## uso_recursosNo                 FALSE      FALSE
## transparencia                  FALSE      FALSE
## honestidadNo                   FALSE      FALSE
## cap_solucion_problemasNo       FALSE      FALSE
## satisfaccion_serv_publicos     FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: forward
##          uso_recursosNo transparencia honestidadNo cap_solucion_problemasNo
## 1  ( 1 ) " "            " "           " "          "*"                     
## 2  ( 1 ) " "            " "           "*"          "*"                     
## 3  ( 1 ) "*"            " "           "*"          "*"                     
## 4  ( 1 ) "*"            " "           "*"          "*"                     
## 5  ( 1 ) "*"            "*"           "*"          "*"                     
##          satisfaccion_serv_publicos
## 1  ( 1 ) " "                       
## 2  ( 1 ) " "                       
## 3  ( 1 ) " "                       
## 4  ( 1 ) "*"                       
## 5  ( 1 ) "*"

which.max(reg.summary.fwd$adjr2) #VER MEJOR NÚM DE VARIABLES SEGÚN MÁXIMA R2

## [1] 5

which.min(reg.summary.fwd$bic)  #VER MEJOR NUM DE VARIABLES SEGÚN MÍNIMO BIC

## [1] 5

coeficientes <- coef(regfit.fwd, id = 5) #PARA VER LOS COEFICIENTES
print(coeficientes)

##                (Intercept)             uso_recursosNo 
##                 3.43153901                -0.24947525 
##              transparencia               honestidadNo 
##                -0.07511558                -0.38533598 
##   cap_solucion_problemasNo satisfaccion_serv_publicos 
##                -0.69000738                -0.03340941

reg.summary.fwd$adjr2[5] #PARA VER R2

## [1] 0.6006821

reg.summary.fwd$bic[5] #PARA VER BIC

## [1] -3180.945

# AJUSTE DEL MODELO CON CONJUNTO DE ENTRENAMIENTO
regfit.fwd_TRAIN <- regsubsets(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
                         cap_solucion_problemas + satisfaccion_serv_publicos, 
                         data = entrenamiento, method = "forward", nvmax = 5)
reg.summary.fwd_TRAIN <- summary(regfit.fwd_TRAIN) #PARA VER MODELO
mejor_modelo_fwd_TRAIN <- which.max(reg.summary.fwd_TRAIN$adjr2) #PARA VER MEJOR NÚM DE VARIABLES
coef_fwd_TRAIN <- coef(regfit.fwd_TRAIN, id = mejor_modelo_fwd_TRAIN) #PARA VER COEFICIENTES
X_test_fwd_TRAIN <- model.matrix(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + cap_solucion_problemas + satisfaccion_serv_publicos, prueba)[, -1]

#CÁLCULO DE PREDICCIONES
pred_fwd <- X_test_fwd_TRAIN %*% coef_fwd_TRAIN[-1] + coef_fwd_TRAIN[1]

#CÁLCULO DE MÉTRICAS RMSE, MAE Y R2
rmse_fwd <- sqrt(mean((prueba$nivel_desconfianza - pred_fwd)^2))
mae_fwd <- mean(abs(prueba$nivel_desconfianza - pred_fwd))
r2_fwd <- cor(prueba$nivel_desconfianza, pred_fwd)^2

#MÉTODO BACKWARD
#MODELO CON TODO EL CONJUNTO PARA ANÁLISIS INICIAL
regfit.bwd <- regsubsets (nivel_desconfianza~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos, data=eav2024_final, method = "backward", nvmax=5)
reg.summary.bwd  <- summary(regfit.bwd)
reg.summary.bwd #OBSERVAR EL MODELO

## Subset selection object
## Call: regsubsets.formula(nivel_desconfianza ~ uso_recursos + transparencia + 
##     honestidad + cap_solucion_problemas + satisfaccion_serv_publicos, 
##     data = eav2024_final, method = "backward", nvmax = 5)
## 5 Variables  (and intercept)
##                            Forced in Forced out
## uso_recursosNo                 FALSE      FALSE
## transparencia                  FALSE      FALSE
## honestidadNo                   FALSE      FALSE
## cap_solucion_problemasNo       FALSE      FALSE
## satisfaccion_serv_publicos     FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: backward
##          uso_recursosNo transparencia honestidadNo cap_solucion_problemasNo
## 1  ( 1 ) " "            " "           " "          "*"                     
## 2  ( 1 ) " "            " "           "*"          "*"                     
## 3  ( 1 ) "*"            " "           "*"          "*"                     
## 4  ( 1 ) "*"            " "           "*"          "*"                     
## 5  ( 1 ) "*"            "*"           "*"          "*"                     
##          satisfaccion_serv_publicos
## 1  ( 1 ) " "                       
## 2  ( 1 ) " "                       
## 3  ( 1 ) " "                       
## 4  ( 1 ) "*"                       
## 5  ( 1 ) "*"

which.max(reg.summary.bwd$adjr2) #PARA VER MEJOR NUM DE VARIABLES CON MAXIMA R2

## [1] 5

which.min(reg.summary.bwd$bic) #PARA VER MEJOR NUM DE VARIABLES CON MINIMO BIC

## [1] 5

which.min(reg.summary.bwd$cp) #PARA VER MEJOR NUM DE VARIABLES CON MINIMO CP

## [1] 5

coeficientes <- coef(regfit.bwd, id = 5) #PARA VER COEFICIENTES
print(coeficientes)

##                (Intercept)             uso_recursosNo 
##                 3.43153901                -0.24947525 
##              transparencia               honestidadNo 
##                -0.07511558                -0.38533598 
##   cap_solucion_problemasNo satisfaccion_serv_publicos 
##                -0.69000738                -0.03340941

reg.summary.bwd$adjr2[5] #PARA VER R2

## [1] 0.6006821

reg.summary.bwd$bic[5] #PARA VER BIC

## [1] -3180.945

# AJUSTE CON MODELO DE ENTRENAMIENTO
regfit.bwd_TRAIN <- regsubsets(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + cap_solucion_problemas + satisfaccion_serv_publicos, 
                         data = entrenamiento, method = "backward", nvmax = 5)
reg.summary.bwd_TRAIN <- summary(regfit.bwd_TRAIN) #PARA VER MODELO
mejor_modelo_bwd_TRAIN <- which.max(reg.summary.bwd_TRAIN$adjr2) #PARA VER MEJOR NUM DE VARIABLES
coef_bwd_TRAIN <- coef(regfit.bwd_TRAIN, id = mejor_modelo_bwd_TRAIN) #PARA VER COEFICIENTES
X_test_bwd_TRAIN <- model.matrix(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + cap_solucion_problemas + satisfaccion_serv_publicos, prueba)[, -1]

# PREDICCIONES CON CONJUNTO DE PRUEBA
pred_bwd <- X_test_bwd_TRAIN %*% coef_bwd_TRAIN[-1] + coef_bwd_TRAIN[1]

#CÁLCULO DE MÉTRICAS RMSE, MAE Y R2
rmse_bwd <- sqrt(mean((prueba$nivel_desconfianza - pred_bwd)^2))
mae_bwd <- mean(abs(prueba$nivel_desconfianza - pred_bwd))
r2_bwd <- cor(prueba$nivel_desconfianza, pred_bwd)^2

8. Regresión polinomial y splines

#MODELO LINEAL GRADO 1
fit.1 <- lm(nivel_desconfianza ~ transparencia, data = eav2024_final, weights = factor_expansion)
#MODELO POLINOMIAL GRADO 2
fit.2 <- lm(nivel_desconfianza ~ poly(transparencia, 2), data = eav2024_final, weights = factor_expansion)
#MODELO POLINOMIAL GRADO 3
fit.3 <- lm(nivel_desconfianza ~ poly(transparencia, 3), data = eav2024_final, weights = factor_expansion)
#COMPARACIÓN DE VARIANZAS CON ANOVA DE LOS TRES MODELOS
anova(fit.1, fit.2, fit.3)

## Analysis of Variance Table
## 
## Model 1: nivel_desconfianza ~ transparencia
## Model 2: nivel_desconfianza ~ poly(transparencia, 2)
## Model 3: nivel_desconfianza ~ poly(transparencia, 3)
##   Res.Df     RSS Df Sum of Sq      F   Pr(>F)   
## 1   3511 2173719                                
## 2   3510 2169540  1    4178.8 6.7594 0.009365 **
## 3   3509 2169335  1     205.2 0.3319 0.564568   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#SE DETERMINÓ QUE EL MEJOR GRADO ES 2

#MODELO LINEAL GRADO 1
fit.1 <- lm(nivel_desconfianza ~ satisfaccion_serv_publicos, data = eav2024_final, weights = factor_expansion)
#MODELO POLINOMIAL GRADO 2
fit.2 <- lm(nivel_desconfianza ~ poly(satisfaccion_serv_publicos, 2), data = eav2024_final, weights = factor_expansion)
#MODELO POLINOMIAL GRADO 3
fit.3 <- lm(nivel_desconfianza ~ poly(satisfaccion_serv_publicos, 3), data = eav2024_final, weights = factor_expansion)
#COMPARACIÓN DE MODELOS CON VARIANZAS CON ANOVA
anova(fit.1, fit.2, fit.3)

## Analysis of Variance Table
## 
## Model 1: nivel_desconfianza ~ satisfaccion_serv_publicos
## Model 2: nivel_desconfianza ~ poly(satisfaccion_serv_publicos, 2)
## Model 3: nivel_desconfianza ~ poly(satisfaccion_serv_publicos, 3)
##   Res.Df     RSS Df Sum of Sq       F   Pr(>F)    
## 1   3511 2110919                                  
## 2   3510 2096525  1   14393.3 24.0905 9.61e-07 ***
## 3   3509 2096512  1      13.1  0.0219   0.8823    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#SE DETERMINÓ QUE EL MEJOR GRADO ES 2

# MODELO POLINOMIAL CON TODAS LAS VARIALES, Y CON TRANSPARENCIA Y SATISFACCION DE SERVICIOS PUBLICOS ELEVADA AL CUADRADO
poli_desconfianza <- lm(nivel_desconfianza ~ uso_recursos + I(transparencia^2) + honestidad + cap_solucion_problemas + I(satisfaccion_serv_publicos^2),
        data = eav2024_final, weights = factor_expansion)
summary (poli_desconfianza) #PARA OBSERVAR EL MODELO

## 
## Call:
## lm(formula = nivel_desconfianza ~ uso_recursos + I(transparencia^2) + 
##     honestidad + cap_solucion_problemas + I(satisfaccion_serv_publicos^2), 
##     data = eav2024_final, weights = factor_expansion)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -81.712  -8.694  -0.352   9.414  72.060 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.2858329  0.0257287 127.711  < 2e-16 ***
## uso_recursosNo                  -0.2629304  0.0275296  -9.551  < 2e-16 ***
## I(transparencia^2)              -0.0211068  0.0030823  -6.848 8.82e-12 ***
## honestidadNo                    -0.4116107  0.0318274 -12.933  < 2e-16 ***
## cap_solucion_problemasNo        -0.6470542  0.0330251 -19.593  < 2e-16 ***
## I(satisfaccion_serv_publicos^2) -0.0027333  0.0004453  -6.138 9.30e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.54 on 3507 degrees of freedom
## Multiple R-squared:  0.5996, Adjusted R-squared:  0.599 
## F-statistic:  1050 on 5 and 3507 DF,  p-value: < 2.2e-16

# AJUSTE CON CONJUNTO DE ENTRENAMIENTO
poli_desconfianza_TRAIN <- lm(nivel_desconfianza ~ uso_recursos + I(transparencia^2) + honestidad + cap_solucion_problemas + I(satisfaccion_serv_publicos^2),
                        data = entrenamiento, weights = factor_expansion)

#REALIZAR PREDICCIONES CON CONJUNTO DE PRUEBA
pred_poli <- predict(poli_desconfianza_TRAIN, newdata = prueba)

#CÁLCULO DE MÉTRICAS RMSE, MAE Y R2
rmse_poli <- sqrt(mean((prueba$nivel_desconfianza - pred_poli)^2))
mae_poli <- mean(abs(prueba$nivel_desconfianza - pred_poli))
r2_poli <- cor(prueba$nivel_desconfianza, pred_poli)^2

# MÉTODO DE SPLINES
spline_desconfianza <- lm(
  nivel_desconfianza ~ 
    bs(transparencia, df = 3) +      
    bs(satisfaccion_serv_publicos, df = 3) +  
    cap_solucion_problemas +       
    uso_recursos +                   
    honestidad,                      
  data = eav2024_final,
  weights = factor_expansion
) #GRADOS DE LIBERTAD 3 PARA VARIABLES NUMÉRICAS, CATEGÓRICAS SIN CAMBIOS

summary(spline_desconfianza) #PARA OBSERVAR EL MODELO

## 
## Call:
## lm(formula = nivel_desconfianza ~ bs(transparencia, df = 3) + 
##     bs(satisfaccion_serv_publicos, df = 3) + cap_solucion_problemas + 
##     uso_recursos + honestidad, data = eav2024_final, weights = factor_expansion)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -79.519  -8.610  -0.462   9.151  71.657 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              3.47946    0.14785  23.534  < 2e-16
## bs(transparencia, df = 3)1              -0.09955    0.06547  -1.520 0.128495
## bs(transparencia, df = 3)2              -0.01217    0.08132  -0.150 0.881097
## bs(transparencia, df = 3)3              -0.38309    0.06528  -5.869 4.81e-09
## bs(satisfaccion_serv_publicos, df = 3)1 -0.59996    0.27492  -2.182 0.029155
## bs(satisfaccion_serv_publicos, df = 3)2 -0.04943    0.11852  -0.417 0.676651
## bs(satisfaccion_serv_publicos, df = 3)3 -0.59167    0.16480  -3.590 0.000335
## cap_solucion_problemasNo                -0.65147    0.03301 -19.733  < 2e-16
## uso_recursosNo                          -0.26294    0.02752  -9.553  < 2e-16
## honestidadNo                            -0.41149    0.03180 -12.940  < 2e-16
##                                            
## (Intercept)                             ***
## bs(transparencia, df = 3)1                 
## bs(transparencia, df = 3)2                 
## bs(transparencia, df = 3)3              ***
## bs(satisfaccion_serv_publicos, df = 3)1 *  
## bs(satisfaccion_serv_publicos, df = 3)2    
## bs(satisfaccion_serv_publicos, df = 3)3 ***
## cap_solucion_problemasNo                ***
## uso_recursosNo                          ***
## honestidadNo                            ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.52 on 3503 degrees of freedom
## Multiple R-squared:  0.601,  Adjusted R-squared:    0.6 
## F-statistic: 586.4 on 9 and 3503 DF,  p-value: < 2.2e-16

# AJUSTE CON CONJUNTO DE ENTRENAMIENTO
spline_desconfianza_TRAIN <- lm(
  nivel_desconfianza ~ 
    bs(transparencia, df = 3) +      
    bs(satisfaccion_serv_publicos, df = 3) +  
    cap_solucion_problemas +       
    uso_recursos +                   
    honestidad,                      
  data = entrenamiento,
  weights = factor_expansion
)

#PREDICCIONES CON CONJUNTO DE PRUEBA
pred_spline <- predict(spline_desconfianza_TRAIN, newdata = prueba)

#CÁLCULO DE MÉTRICAS RMSE, MAE Y R2
rmse_spline <- sqrt(mean((prueba$nivel_desconfianza - pred_spline)^2))
mae_spline <- mean(abs(prueba$nivel_desconfianza - pred_spline))
r2_spline <- cor(prueba$nivel_desconfianza, pred_spline)^2

9. Árboles de decisiones

#MÉTODO DE ÁRBOL DE DECISIÓN
arbol <- tree(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
                    cap_solucion_problemas + satisfaccion_serv_publicos, 
              data = eav2024_final,
              control = tree.control(nobs = nrow(eav2024_final), minsize = 50, mindev = 0.001)) #SÓLO SE DEVIDE SI LA DESVIACIÓN RESIDUAL SE REDUCE AL MENOS UN 0.1%
summary(arbol) #PARA OBERVAR EL MODELO

## 
## Regression tree:
## tree(formula = nivel_desconfianza ~ uso_recursos + transparencia + 
##     honestidad + cap_solucion_problemas + satisfaccion_serv_publicos, 
##     data = eav2024_final, control = tree.control(nobs = nrow(eav2024_final), 
##         minsize = 50, mindev = 0.001))
## Number of terminal nodes:  17 
## Residual mean deviance:  0.2663 = 931.1 / 3496 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.27200 -0.34750  0.02759  0.00000  0.29150  2.41800

plot(arbol) #PARA GRAFICAR EL MODELO
text(arbol, pretty = 0, cex = 0.5)

# AJUSTE CON SUBCONJUNTO DE ENTRENAMIENTO
arbol_TRAIN <- tree(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
                    cap_solucion_problemas + satisfaccion_serv_publicos, 
              data = entrenamiento,
              control = tree.control(nobs = nrow(entrenamiento), minsize = 50, mindev = 0.001))

#CÁLCULO DE PREDICCIONES
pred_arbol <- predict(arbol, newdata = prueba)

#CÁLCULO DE MÉTRICAS RMSE, MAE Y R2
rmse_arbol <- sqrt(mean((prueba$nivel_desconfianza - pred_arbol)^2))
mae_arbol <- mean(abs(prueba$nivel_desconfianza - pred_arbol))
r2_arbol <- cor(prueba$nivel_desconfianza, pred_arbol)^2

# MÉTODO RANDOM FOREST
set.seed(123) #PARA QUE SE PUEDA REPETIR
#CREACIÓN DEL MODELO
modelo_rf <- randomForest(
  nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos,
  data = eav2024_final,
  ntree = 500,            
  mtry = 2,               
  nodesize = 50,          
  importance = TRUE       
)

# AJUSTE DEL MODELO CON CONJUNTO DE ENTRENAMIENTO
set.seed(123)
modelo_rf_TRAIN <- randomForest(
  nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos,
  data = entrenamiento,
  ntree = 500,            
  mtry = 2,               
  nodesize = 50,          
  importance = TRUE       
)
#CÁLCULO DE PREDICCIONES CON CONJUNTO DE PRUEBA
pred_rf <- predict(modelo_rf_TRAIN, newdata = prueba)
real <- prueba$nivel_desconfianza

#CREACIÓN DE MÉTRICAS RMSE, MAE Y R2
mae_rf <- mean(abs(pred_rf - real))
rmse_rf <- sqrt(mean((pred_rf - real)^2))
sst <- sum((real - mean(real))^2)
sse <- sum((real - pred_rf)^2)
r2_rf <- 1 - (sse / sst)

10. Comparación de modelos y selección del mejor

# TABLA COMPARATIVA (USA A LAS MÉTRICAS RMSE, MAE Y R2 CALCULADAS A LO LARGO DEL CÓDIGO)
comparacion <- data.frame(
  Modelo = c("Regresión Lineal", "PLS", "Ridge", "Lasso", "Subconjunto Full", "Forward", "Backward", "Polinomial", "Splines", "Árbol de Decisiones", "Random Forest"),
  RMSE = c(rmse_desconfianza, rmse_pls, rmse_ridge, rmse_lasso, rmse_full, rmse_fwd, rmse_bwd, rmse_poli, rmse_spline, rmse_arbol, rmse_rf),
  MAE = c(mae_desconfianza, mae_pls, mae_ridge, mae_lasso, mae_full, mae_fwd, mae_bwd, mae_poli, mae_spline, mae_arbol, mae_rf),
  R2 = c(r2_desconfianza, r2_pls, r2_ridge, r2_lasso, r2_full, r2_fwd, r2_bwd, r2_poli, r2_spline, r2_arbol, r2_rf)
)
print(comparacion) #PARA VISUALIZAR LA TABLA

##                 Modelo      RMSE       MAE        R2
## 1     Regresión Lineal 0.5247405 0.4100130 0.6156728
## 2                  PLS 0.5301126 0.4123576 0.6089290
## 3                Ridge 0.5283890 0.4097778 0.6124929
## 4                Lasso 0.5335874 0.4148628 0.6018457
## 5     Subconjunto Full 0.5249581 0.4087624 0.6156435
## 6              Forward 0.5268591 0.4091342 0.6136758
## 7             Backward 0.5268591 0.4091342 0.6136758
## 8           Polinomial 0.5271232 0.4108251 0.6135037
## 9              Splines 0.5279060 0.4091704 0.6125824
## 10 Árbol de Decisiones 0.5193885 0.4034628 0.6234272
## 11       Random Forest 0.5207818 0.4018511 0.6209594

# TABLA COMPARATIVA CROSS VALIDATION
# PARA ESTA SECCIÓN, SE CALCULÓ NUEVAMENTE TODOS LOS MODELOS CON CARET PARA HACER CROSS VALIDATION Y HACER LA MISMA TABLA ANTERIOR PERO CON ESTOS PARÁMETROS

control_cv <- trainControl(method = "cv", number = 5)

# REGRESION LINEAL

modelo_lm_cv <- train(
  nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos,
  data = eav2024_final,
  method = "lm",
  weights = factor_expansion,
  trControl = control_cv,
  preProcess = c("center", "scale")
)

#PLS
set.seed(123)
modelo_pls_cv <- train(
  nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos,
  data = eav2024_final,
  method = "pls",
  preProcess = c("center", "scale"),
  weights = eav2024_final$factor_expansion,
  trControl = control_cv,
  tuneLength = 10 
)

# RIDGE

x <- model.matrix(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
                    cap_solucion_problemas + satisfaccion_serv_publicos, 
                  data = eav2024_final)[, -1] 

y <- eav2024_final$nivel_desconfianza
w <- eav2024_final$factor_expansion

datos_ridge <- as.data.frame(cbind(x, nivel_desconfianza = y))

modelo_ridge_cv <- train(
  nivel_desconfianza ~ .,
  data = datos_ridge,
  method = "glmnet",
  preProcess = c("center", "scale"),
  weights = w,
  trControl = control_cv,
  tuneGrid = expand.grid(
    alpha = 0,                        
    lambda = 10^seq(10, -2, length = 100) 
  )
)

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.

# LASSO

x <- model.matrix(nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
                    cap_solucion_problemas + satisfaccion_serv_publicos, data = eav2024_final)[, -1]

y <- eav2024_final$nivel_desconfianza
w <- eav2024_final$factor_expansion
datos_lasso <- as.data.frame(cbind(x, nivel_desconfianza = y))
modelo_lasso_cv <- train(
  nivel_desconfianza ~ .,
  data = datos_lasso,
  method = "glmnet",
  preProcess = c("center", "scale"),
  weights = w,
  trControl = control_cv,
  tuneGrid = expand.grid(
    alpha = 1,                             # Lasso
    lambda = 10^seq(10, -2, length = 100) # secuencia de lambdas
  )
)

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.

# SUBCONJUNTO FULL

modelo_full_cv <- train(
  nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos,
  data = eav2024_final,
  method = "leapSeq",     
  tuneLength = 5,       
  trControl = control_cv,
  preProcess = c("center", "scale")
)


# FORWARD

modelo_forward_cv <- train(
  nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos,
  data = eav2024_final,
  method = "leapForward",
  tuneLength = 5,          
  trControl = control_cv,
  preProcess = c("center", "scale")
)


# BACKWARD

modelo_backward_cv <- train(
  nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos,
  data = eav2024_final,
  method = "leapBackward",      # backward stepwise
  tuneLength = 5,               # prueba modelos con 1 a 5 variables
  trControl = control_cv,
  preProcess = c("center", "scale")
)

# POLINOMIAL

eav2024_final_cv <- eav2024_final
eav2024_final_cv$transparencia2 <- eav2024_final$transparencia^2
eav2024_final_cv$satisfaccion2 <- eav2024_final$satisfaccion_serv_publicos^2
modelo_poli_cv <- train(
  nivel_desconfianza ~ uso_recursos + transparencia2 + honestidad + cap_solucion_problemas + satisfaccion2,
  data = eav2024_final_cv,
  method = "lm",
  weights = eav2024_final$factor_expansion,
  trControl = control_cv,
  preProcess = c("center", "scale")
)


# ARBOL DE DECISIONES

modelo_arbol_cv <- train(
  nivel_desconfianza ~ uso_recursos + transparencia + honestidad + 
    cap_solucion_problemas + satisfaccion_serv_publicos,
  data = eav2024_final,
  method = "rpart",  
  trControl = trainControl(method = "cv", number = 5),
  tuneLength = 10
)

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.

# RESULTADOS EN LA TABLA
resultados <- resamples(list(LINEAL = modelo_lm_cv, PLS = modelo_pls_cv, RIDGE = modelo_ridge_cv, LASSO = modelo_lasso_cv, FULL = modelo_full_cv, FORWARD = modelo_forward_cv, BACKWARD = modelo_backward_cv, POLINOMIAL = modelo_poli_cv, ARBOL = modelo_arbol_cv))
summary(resultados)

## 
## Call:
## summary.resamples(object = resultados)
## 
## Models: LINEAL, PLS, RIDGE, LASSO, FULL, FORWARD, BACKWARD, POLINOMIAL, ARBOL 
## Number of resamples: 5 
## 
## MAE 
##                 Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LINEAL     0.4033076 0.4036134 0.4122514 0.4160344 0.4146086 0.4463911    0
## PLS        0.4021059 0.4107766 0.4140450 0.4156300 0.4168364 0.4343858    0
## RIDGE      0.4079585 0.4094664 0.4149745 0.4144644 0.4170653 0.4228572    0
## LASSO      0.4039181 0.4066887 0.4096076 0.4143606 0.4193563 0.4322320    0
## FULL       0.4015595 0.4076628 0.4109417 0.4155107 0.4193972 0.4379922    0
## FORWARD    0.4046620 0.4109643 0.4121251 0.4150148 0.4179738 0.4293489    0
## BACKWARD   0.4056064 0.4077626 0.4079527 0.4155093 0.4253375 0.4308872    0
## POLINOMIAL 0.3944603 0.4101261 0.4175937 0.4164656 0.4214112 0.4387364    0
## ARBOL      0.3983101 0.4118424 0.4200281 0.4156439 0.4225824 0.4254565    0
## 
## RMSE 
##                 Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LINEAL     0.5097762 0.5242637 0.5246281 0.5306758 0.5262969 0.5684138    0
## PLS        0.5252846 0.5265188 0.5271244 0.5312927 0.5279426 0.5495932    0
## RIDGE      0.5160489 0.5284097 0.5332996 0.5315991 0.5386438 0.5415936    0
## LASSO      0.5216522 0.5239609 0.5240592 0.5310818 0.5386116 0.5471251    0
## FULL       0.5189526 0.5209094 0.5291044 0.5311629 0.5359012 0.5509469    0
## FORWARD    0.5153826 0.5293393 0.5294265 0.5310681 0.5350699 0.5461224    0
## BACKWARD   0.5206186 0.5227406 0.5272680 0.5311577 0.5275907 0.5575704    0
## POLINOMIAL 0.4948304 0.5304958 0.5352983 0.5307644 0.5390036 0.5541939    0
## ARBOL      0.5108121 0.5220065 0.5301997 0.5326834 0.5451412 0.5552578    0
## 
## Rsquared 
##                 Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LINEAL     0.5660765 0.5910989 0.6003171 0.6006721 0.6148168 0.6310514    0
## PLS        0.5817542 0.6014574 0.6030881 0.6003621 0.6049872 0.6105237    0
## RIDGE      0.5804904 0.5991726 0.5994417 0.6002781 0.6035355 0.6187500    0
## LASSO      0.5723145 0.6021497 0.6038297 0.6000622 0.6046580 0.6173592    0
## FULL       0.5808135 0.5957652 0.6008676 0.6004875 0.6101948 0.6147963    0
## FORWARD    0.5630135 0.5945411 0.6008817 0.5999228 0.6079709 0.6332069    0
## BACKWARD   0.5768465 0.6044225 0.6044676 0.6000320 0.6044715 0.6099518    0
## POLINOMIAL 0.5690717 0.5806274 0.5983973 0.6006520 0.6044543 0.6507093    0
## ARBOL      0.5700087 0.5837771 0.6023921 0.5977462 0.6115473 0.6210059    0

Justificación: Despúes de analizar todos los modelos predictivos, el modelo de Random Forest resultó ser el que tiene mejor desempeño general, con un RMSE de 0.5207818, MAE de 0.4018511 y un R² aproximado de 0.6209594 sin cross validation. Sin embargo, al comparar con el modelo de regresión lineal base, que obtuvo un RMSE de 16.56, un MAE similar y un R² de 0.5985, se observa que las diferencias en capacidad predictiva son mínimas. Dado que el modelo lineal es considerablemente más fácil de interpretar y explicar, especialmente en contextos como los de esta evidencia, donde los resultados serán presentados a un público no especializado en este tipo de análisis, se eligió utilizar el modelo lineal como base del análisis final. Esta elección prioriza la interpretación del modelo considerando que no haya un sacrificio significativo de la capacidad predictiva de la variable nivel de desconfianza.

11. Implementación en Shiny

# UI

# SERVER

12. Conclusiones

Describe aquí tus hallazgos, interpretación de métricas y recomendaciones para la política o la aplicación práctica del modelo.